halide / Halide

a language for fast, portable data-parallel computation
https://halide-lang.org
Other
5.78k stars 1.07k forks source link

terminate called after throwing an instance of 'Halide::Error' (from 0D indirect index) #8338

Open jansel opened 3 days ago

jansel commented 3 days ago

Repro:

import halide as hl

# Minimal Halide generator reproducing the reported crash: a 2-D float input
# is gathered along its second dimension using a scalar index read from a
# 0-dimensional int64 input buffer.
@hl.generator(name="kernel")
class Kernel:
    # 0-dimensional Int(64) input holding the (unclamped) index. Per the
    # report, declaring this buffer as 1-D instead of 0-D avoids the crash.
    in_ptr0 = hl.InputBuffer(hl.Int(64), 0)
    # 2-dimensional Float(32) input that is indexed indirectly.
    in_ptr1 = hl.InputBuffer(hl.Float(32), 2)
    # 1-dimensional Float(32) output.
    out_ptr0 = hl.OutputBuffer(hl.Float(32), 1)

    def generate(g):
        # Local aliases for the generator's buffers.
        in_ptr0 = g.in_ptr0
        in_ptr1 = g.in_ptr1
        out_ptr0 = g.out_ptr0
        h0 = hl.Var("h0")
        # Zero-dimensional Func: the index from in_ptr0, cast to Int(32) and
        # clamped to [0, 7] (matches the extent of 8 set on in_ptr1.dim(1)).
        tmp6 = hl.Func("tmp6")
        tmp6[()] = hl.clamp(hl.cast(hl.Int(32), in_ptr0[()]), 0, 7)
        # Indirect index: the second coordinate of in_ptr1 comes from tmp6.
        out_ptr0[h0,] = in_ptr1[h0, tmp6[()]]

        # The repro is only meaningful when an autoscheduler is in use.
        assert g.using_autoscheduler()
        # Empty estimate list, since in_ptr0 has no dimensions.
        in_ptr0.set_estimates([])
        # Fix in_ptr1's layout: dim 0 is min 0 / stride 1 / extent 4 ...
        in_ptr1.dim(0).set_min(0)
        in_ptr1.dim(0).set_stride(1)
        in_ptr1.dim(0).set_extent(4)
        # ... and dim 1 is min 0 / stride 4 / extent 8 (stride equals the
        # extent of dim 0).
        in_ptr1.dim(1).set_min(0)
        in_ptr1.dim(1).set_stride(4)
        in_ptr1.dim(1).set_extent(8)
        # Estimates mirror the fixed bounds above.
        in_ptr1.set_estimates([hl.Range(0, 4), hl.Range(0, 8)])
        out_ptr0.set_estimates([hl.Range(0, 4)])

if __name__ == "__main__":
    # Invoke Halide's generator command line in-process by substituting
    # sys.argv, writing all outputs to a throwaway directory.
    import sys
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Flag/value pairs, in the order they appear on the command line.
        flag_pairs = (
            ("-g", "kernel"),
            ("-o", scratch_dir),
            ("-f", "halide_kernel"),
            ("-e", "static_library,h,schedule"),
            # Path taken verbatim from the original report; it points into
            # the reporter's home directory and will need adjusting locally.
            ("-p", "/home/jansel/conda/envs/pytorch/lib/libautoschedule_adams2019.so"),
        )
        # Trailing key=value generator parameters.
        generator_params = (
            "target=host-strict_float-no_runtime-no_asserts",
            "autoscheduler=Adams2019",
            "autoscheduler.parallelism=8",
        )

        argv = ["repro.py"]
        for flag, value in flag_pairs:
            argv.extend((flag, value))
        argv.extend(generator_params)

        sys.argv = argv
        hl.main()

Output:

terminate called after throwing an instance of 'Halide::Error'
zsh: IOT instruction (core dumped)  python repro.py

It works if I make in_ptr0 1D (rather than 0D):

diff --git a/repro.py b/repro.py
index c5397a9c0a5..52785bda978 100644
--- a/repro.py
+++ b/repro.py
@@ -3,7 +3,7 @@ import halide as hl

 @hl.generator(name="kernel")
 class Kernel:
-    in_ptr0 = hl.InputBuffer(hl.Int(64), 0)
+    in_ptr0 = hl.InputBuffer(hl.Int(64), 1)
     in_ptr1 = hl.InputBuffer(hl.Float(32), 2)
     out_ptr0 = hl.OutputBuffer(hl.Float(32), 1)

@@ -13,11 +13,11 @@ class Kernel:
         out_ptr0 = g.out_ptr0
         h0 = hl.Var("h0")
         tmp6 = hl.Func("tmp6")
-        tmp6[()] = hl.clamp(hl.cast(hl.Int(32), in_ptr0[()]), 0, 7)
+        tmp6[()] = hl.clamp(hl.cast(hl.Int(32), in_ptr0[0]), 0, 7)
         out_ptr0[h0,] = in_ptr1[h0, tmp6[()]]

         assert g.using_autoscheduler()
-        in_ptr0.set_estimates([])
+        in_ptr0.set_estimates([hl.Range(0, 1)])
         in_ptr1.dim(0).set_min(0)
         in_ptr1.dim(0).set_stride(1)
         in_ptr1.dim(0).set_extent(4)