inducer / loopy

A code generator for array-based code on CPUs and GPUs
http://mathema.tician.de/software/loopy
MIT License
580 stars 70 forks source link

check_implemented_domains fails #730

Closed isuruf closed 1 year ago

isuruf commented 1 year ago

The following kernel fails with:

loopy.diagnostic.LoopyError: sanity check failed--implemented and desired domain for instruction 'write_result' do not match

implemented: [ntgt_boxes, itgt_end, itgt_start] -> { [itgt_box, itgt, iknl] : 0 <= itgt_box < ntgt_boxes and itgt_start <= itgt < itgt_end }

desired:[ntgt_boxes, itgt_end, itgt_start] -> { [itgt_box, itgt, iknl] : 0 <= itgt_box < ntgt_boxes and itgt_start <= itgt < itgt_end and 0 <= iknl <= 1 }

sample point in implemented but not desired: itgt=0, itgt_box=0, ntgt_boxes=1, itgt_end=1, itgt_start=0, iknl=2
gist of constraints in implemented but not desired: [ntgt_boxes, itgt_end, itgt_start] -> { [itgt_box, itgt, iknl] : 0 <= iknl <= 1 }

loopy kernel:

```python import loopy as lp import numpy as np from pymbolic.primitives import * import immutables e2p_from_csr_knl = lp.make_kernel( [ "[ntgt_boxes] -> { [itgt_box] : 0 <= itgt_box < ntgt_boxes }", "[itgt_end, itgt_start] -> { [itgt] : itgt_start <= itgt < itgt_end }", "[isrc_box_end, isrc_box_start] -> { [isrc_box] : isrc_box_start <= isrc_box < isrc_box_end }", "{ [idim, idim_0] : 0 <= idim <= 1 and 0 <= idim_0 <= 1 }", "{ [icoeff_0] : 0 <= icoeff_0 <= 2 }", "{ [iknl, iknl_0] : 0 <= iknl <= 1 and 0 <= iknl_0 <= 1 }", "{ [e2p_idim] : 0 <= e2p_idim <= 1 }", ], ''' for itgt_box tgt_ibox = target_boxes[itgt_box] {id=insn} itgt_start = box_target_starts[tgt_ibox] {id=insn_0, dep=insn} itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] {id=insn_1, dep=insn:insn_0} for itgt tgt[idim] = targets[idim, itgt] {id=insn_2} isrc_box_start = source_box_starts[itgt_box] {id=insn_3} isrc_box_end = source_box_starts[itgt_box + 1] {id=insn_4} result_temp[iknl_0] = 0 {id=init_result} for isrc_box src_ibox = source_box_lists[isrc_box] {id=insn_5} coeffs[icoeff_0] = src_expansions[src_ibox + (-1)*src_base_ibox, icoeff_0] {id=fetch_coeffs, dep=insn_5} center[idim_0] = centers[idim_0, src_ibox] {id=fetch_center, dep=insn_5} ... 
nop {id=e2p__start, dep=fetch_coeffs:insn_2:init_result:fetch_center} e2p_kernel_scaling = ((-1) / 2)*3.141592653589793**(-1) {id=e2p_insn, dep=e2p__start} e2p_b[e2p_idim] = tgt[e2p_idim] + (-1)*center[e2p_idim] {id=e2p_insn_0, dep=e2p__start} e2p_expr_4 = 1 / rscale {id=e2p_insn_1, dep=e2p__start} e2p_expr_0 = e2p_b[0]*e2p_b[0] + e2p_b[1]*e2p_b[1] {id=e2p_insn_2, dep=e2p_insn_0:e2p__start} e2p_expr_1 = rscale*(1 / e2p_expr_0) {id=e2p_insn_3, dep=e2p__start:e2p_insn_2} e2p_expr_2 = e2p_b[1]*coeffs[2] {id=e2p_insn_4, dep=e2p_insn_0:e2p__start} e2p_expr_3 = e2p_b[0]*e2p_expr_1 {id=e2p_insn_5, dep=e2p_insn_0:e2p_insn_3:e2p__start} e2p_temp_2 = e2p_b[0]*e2p_expr_4 {id=e2p_insn_6, dep=e2p_insn_0:e2p_insn_1:e2p__start} e2p_temp_0 = e2p_b[1]*e2p_expr_4 {id=e2p_insn_7, dep=e2p_insn_0:e2p_insn_1:e2p__start} e2p_cse_exprvar = e2p_temp_0*e2p_temp_0 + e2p_temp_2*e2p_temp_2 {id=e2p_insn_8, dep=e2p_insn_6:e2p__start:e2p_insn_7} e2p_cse_exprvar_0 = sqrt(e2p_cse_exprvar) {id=e2p_insn_9, dep=e2p_insn_8:e2p__start} e2p_temp_1 = e2p_cse_exprvar_0 {id=e2p_insn_10, dep=e2p_insn_9:e2p__start} e2p_cse_exprvar_1 = sqrt(e2p_expr_0) {id=e2p_insn_11, dep=e2p__start:e2p_insn_2} result_temp[0] = result_temp[0] + e2p_kernel_scaling*(coeffs[0]*log(e2p_cse_exprvar_1) + e2p_expr_3*coeffs[1] + e2p_expr_2*e2p_expr_1) {id=e2p_result_0, dep=e2p_insn:e2p_insn_4:e2p_insn_5:e2p_insn_11:e2p_insn_3:e2p__start} e2p_temp_5 = e2p_expr_4 {id=e2p_insn_12, dep=e2p_insn_1:e2p__start} e2p_cse_exprvar_2 = 1 / e2p_temp_1 {id=e2p_insn_13, dep=e2p__start:e2p_insn_10} e2p_temp_6 = (1 + (-2)*e2p_temp_2*e2p_expr_3)*e2p_cse_exprvar_2*e2p_cse_exprvar_2 {id=e2p_insn_14, dep=e2p__start:e2p_insn_5:e2p_insn_13:e2p_insn_6} e2p_cse_exprvar_3 = 1 / e2p_expr_0 {id=e2p_insn_15, dep=e2p__start:e2p_insn_2} result_temp[1] = result_temp[1] + e2p_kernel_scaling*(e2p_expr_3*e2p_temp_5*coeffs[0] + e2p_temp_5*e2p_temp_6*coeffs[1] + (-2)*e2p_b[0]*e2p_expr_2*e2p_temp_5*rscale*rscale*e2p_cse_exprvar_3*e2p_cse_exprvar_3) {id=e2p_result_1, 
dep=e2p_insn_0:e2p_insn:e2p_insn_4:e2p_insn_5:e2p_insn_14:e2p_insn_15:e2p__start:e2p_insn_12} ... nop {id=update_result, dep=e2p_result_1:e2p_insn_7:e2p_insn_2:e2p_insn_5:e2p_insn_15:e2p_insn_12:e2p_insn_4:e2p_insn_11:e2p_insn_3:e2p_insn_9:e2p_result_0:e2p_insn_1:e2p_insn_6:e2p_insn:e2p_insn_8:e2p_insn_14:e2p_insn_10:e2p_insn_13:e2p_insn_0} end result[iknl, itgt] = result_temp[iknl] {id=write_result, dep=update_result:init_result} end end ''', [ lp.GlobalArg( name="targets", dtype=np.float64, shape=(2, Variable('ntargets')), for_atomic=False), lp.GlobalArg( name="box_target_starts", dtype=np.uint32, shape=None, for_atomic=False), lp.GlobalArg( name="box_target_counts_nonchild", dtype=np.uint32, shape=None, for_atomic=False), lp.GlobalArg( name="centers", dtype=np.float64, shape=(2, Variable('aligned_nboxes')), for_atomic=False), lp.GlobalArg( name="src_expansions", dtype=np.float64, shape=(Variable('nsrc_level_boxes'), 3), for_atomic=False), lp.ValueArg( name="src_base_ibox", dtype=np.int32), lp.ValueArg( name="nsrc_level_boxes", dtype=np.int32), lp.ValueArg( name="aligned_nboxes", dtype=np.int32), lp.ValueArg( name="ntargets", dtype=np.int32), lp.GlobalArg( name="result", dtype=np.float64, shape=(2, Variable('ntargets')), for_atomic=False), lp.GlobalArg( name="source_box_starts", dtype=np.uint32, shape=None, for_atomic=False), lp.GlobalArg( name="source_box_lists", dtype=np.uint32, shape=None, for_atomic=False), lp.ValueArg( name="ntgt_boxes", dtype=np.int32), lp.ValueArg( name="rscale", dtype=np.float64), lp.GlobalArg( name="target_boxes", dtype=np.uint32, shape=(Variable('ntgt_boxes'),), for_atomic=False), lp.TemporaryVariable( name="tgt_ibox", dtype=np.uint32, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="itgt_start", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="itgt_end", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.auto, 
read_only=False, ), lp.TemporaryVariable( name="tgt", dtype=np.float64, shape=(2,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="isrc_box_start", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="isrc_box_end", dtype=np.int32, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="result_temp", dtype=np.float64, shape=(2,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="src_ibox", dtype=np.uint32, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="coeffs", dtype=np.float64, shape=(3,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="center", dtype=np.float64, shape=(2,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_cse_exprvar", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_cse_exprvar_0", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_cse_exprvar_1", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_cse_exprvar_2", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_cse_exprvar_3", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_kernel_scaling", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_b", dtype=np.float64, shape=(2,), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_expr_4", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, 
), lp.TemporaryVariable( name="e2p_expr_0", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_expr_1", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_expr_2", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_expr_3", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_temp_2", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_temp_0", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_temp_1", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_temp_5", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), lp.TemporaryVariable( name="e2p_temp_6", dtype=np.float64, shape=(), for_atomic=False, address_space=lp.auto, read_only=False, ), ], lang_version=(2018, 2), name="e2p_from_csr", ) e2p_from_csr_knl = lp.tag_inames(e2p_from_csr_knl, "e2p_idim:unr") e2p_from_csr_knl = lp.tag_inames(e2p_from_csr_knl, "iknl:unr") e2p_from_csr_knl = lp.tag_inames(e2p_from_csr_knl, "idim:unr") e2p_from_csr_knl = lp.tag_inames(e2p_from_csr_knl, "iknl_0:unr") e2p_from_csr_knl = lp.tag_inames(e2p_from_csr_knl, "idim_0:unr") e2p_from_csr_knl = lp.tag_inames(e2p_from_csr_knl, "itgt_box:g.0") t_unit = lp.merge([e2p_from_csr_knl]) lp.generate_code_v2(t_unit).device_code() ```
inducer commented 1 year ago

Turns out this is https://github.com/inducer/islpy/pull/103/ in disguise. See #768.

isuruf commented 1 year ago

Thanks @inducer for tracking this down to https://github.com/inducer/islpy/pull/103 and for the PR.