halide/Halide: a language for fast, portable data-parallel computation (https://halide-lang.org)

Internal Error: Condition failed: bounds.find(*it) != bounds.end() #8312

Closed: jansel closed this issue 6 days ago

jansel commented 1 week ago

Repro:

import halide as hl

@hl.generator(name="kernel")
class Kernel:
    in_ptr0 = hl.InputBuffer(hl.Int(64), 1)
    in_ptr1 = hl.InputBuffer(hl.Float(32), 1)
    out_ptr0 = hl.OutputBuffer(hl.Float(32), 2)

    def generate(g):
        in_ptr0 = g.in_ptr0
        in_ptr1 = g.in_ptr1
        out_ptr0 = g.out_ptr0
        h0 = hl.Var("h0")
        # Reduction domain spanning the output's first dimension (0..3).
        odom = hl.RDom([hl.Range(0, 4)])
        ho0 = odom[0]
        # Load a scalar index from in_ptr0, normalize Python-style negative
        # indexing (add 8 if negative), and clamp it into [0, 7].
        tmp0 = in_ptr0[0,]
        tmp1 = 8
        tmp2 = tmp0 + tmp1
        tmp3 = tmp0 < 0
        tmp4 = hl.select(tmp3, tmp2, hl.cast(tmp2.type(), tmp0))
        tmp5 = hl.cast(hl.Int(32), tmp4)
        tmp6 = hl.clamp(tmp5, 0, 7)
        # Stage in_ptr1 through a Func, then scatter it into out_ptr0 at a
        # data-dependent column; the pure definition is left undefined.
        tmp7 = hl.Func("tmp7")
        tmp7[h0] = in_ptr1[h0,]
        out_ptr0[hl.Var(), hl.Var()] = hl.undef(out_ptr0.type())
        out_ptr0[
            ho0,
            tmp6,
        ] = hl.cast(hl.Float(32), tmp7[ho0])

        assert g.using_autoscheduler()
        # Shape constraints and estimates consumed by the autoscheduler.
        in_ptr0.dim(0).set_min(0)
        in_ptr0.dim(0).set_stride(1)
        in_ptr0.dim(0).set_extent(1)
        in_ptr0.set_estimates([hl.Range(0, 1)])
        in_ptr1.dim(0).set_min(0)
        in_ptr1.dim(0).set_stride(1)
        in_ptr1.dim(0).set_extent(4)
        in_ptr1.set_estimates([hl.Range(0, 4)])
        out_ptr0.set_estimates([hl.Range(0, 4), hl.Range(0, 8)])

if __name__ == "__main__":
    import sys, tempfile

    with tempfile.TemporaryDirectory() as out:
        # Run the generator CLI in-process: emit a static library, header,
        # and schedule for a CUDA target using the Li2018 autoscheduler.
        sys.argv = [
            "repro.py",
            "-g",
            "kernel",
            "-o",
            out,
            "-f",
            "halide_kernel",
            "-e",
            "static_library,h,schedule",
            "-p",
            "/home/jansel/conda/envs/pytorch/lib/libautoschedule_li2018.so",
            "target=host-cuda-cuda_capability_86-user_context-strict_float-no_runtime-no_asserts",
            "autoscheduler=Li2018",
            "autoscheduler.parallelism=82",
        ]
        hl.main()
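
For reference, hl.main() just parses sys.argv as the standard generator command line, so with the sys.argv override removed this is roughly equivalent to running the following from a shell (<outdir> stands in for the temporary directory):

python repro.py -g kernel -o <outdir> -f halide_kernel -e static_library,h,schedule \
    -p /home/jansel/conda/envs/pytorch/lib/libautoschedule_li2018.so \
    target=host-cuda-cuda_capability_86-user_context-strict_float-no_runtime-no_asserts \
    autoscheduler=Li2018 autoscheduler.parallelism=82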

On CPU, this works.
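
(The CPU result was presumably obtained with the same script and the CUDA features stripped from the target string, e.g. target=host-strict_float-no_runtime-no_asserts; the exact CPU invocation isn't shown here.)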

On CUDA with the Li2018 autoscheduler, generation fails:

Unhandled exception: Internal Error at /home/jansel/Halide/src/DerivativeUtils.cpp:256 triggered by user code at : Condition failed: bounds.find(*it) != bounds.end(): 

Traceback (most recent call last):
  File "/home/jansel/pytorch/repro.py", line 64, in <module>
    hl.main()
RuntimeError: Generator failed: -1

On CUDA with the Anderson2021 autoscheduler, the generated code segfaults when called.

abadams commented 1 week ago

This is happening because Li2018 doesn't consider all of the Exprs a Func uses, only the ones on the RHS of its definitions, and this pipeline contains a Func that is only ever referred to on a LHS.
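
A minimal sketch of that pattern (my distillation, with hypothetical names, not code from the report): a Func that is called only inside the left-hand-side index of another Func's update, so a bounds pass that only visits RHS calls never records bounds for it. In the repro above, the analogous reference appears to be the in_ptr0-derived index tmp6 on the LHS of out_ptr0's update.

import halide as hl

x = hl.Var("x")
r = hl.RDom([hl.Range(0, 4)])
rv = r[0]

idx = hl.Func("idx")
idx[x] = hl.clamp(hl.cast(hl.Int(32), x), 0, 7)

out = hl.Func("out")
out[x] = hl.undef(hl.Float(32))
# idx is referenced only inside the LHS index of this update and never on
# any RHS, so bounds inference that only walks RHS calls misses it.
out[idx[rv]] = hl.cast(hl.Float(32), 1.0)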