Invalid LLVM error from hl.is_inf(float16) #8309

Closed by jansel 1 week ago

jansel commented 1 week ago

Repro:

import halide as hl

@hl.generator(name="kernel")
class Kernel:
    in_ptr0 = hl.InputBuffer(hl.Float(16), 2)
    out_ptr3 = hl.OutputBuffer(hl.Bool(), 1)

    def generate(g):
        in_ptr0 = g.in_ptr0
        out_ptr3 = g.out_ptr3
        h0 = hl.Var("h0")
        h1 = hl.Var("h1")
        rdom = hl.RDom([hl.Range(0, 768)])
        hr1 = rdom[0]
        tmp0 = hl.Func("tmp0")
        tmp0[h0, h1] = hl.cast(
            hl.Float(16),
            in_ptr0[
                h0,
                h1,
            ],
        )
        tmp27 = hl.Func("tmp27")
        tmp27[h0, h1] = hl.is_inf(tmp0[h0, h1])
        tmp28 = hl.Func("tmp28")
        tmp28[h1] = hl.maximum(rdom, tmp27[hr1, h1])
        out_ptr3[h1,] = hl.cast(hl.Bool(), tmp28[h1])

        assert g.using_autoscheduler()
        in_ptr0.dim(0).set_min(0)
        in_ptr0.dim(0).set_stride(1)
        in_ptr0.dim(0).set_extent(768)
        in_ptr0.dim(1).set_min(0)
        in_ptr0.dim(1).set_stride(768)
        in_ptr0.dim(1).set_extent(512)
        in_ptr0.set_estimates([hl.Range(0, 768), hl.Range(0, 512)])
        out_ptr3.set_estimates([hl.Range(0, 512)])

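# When run directly, drive the generator through the Halide generator CLI
# (hl.main()); the else branch below instead builds, autoschedules, and
# compiles the pipeline in-process.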
if __name__ == "__main__":
    import sys, tempfile

    with tempfile.TemporaryDirectory() as out:
        sys.argv = [
            "repro.py",
            "-g",
            "kernel",
            "-o",
            out,
            "-f",
            "halide_kernel",
            "-e",
            "static_library,h,schedule",
            "-p",
            "/home/jansel/conda/envs/pytorch/lib/libautoschedule_anderson2021.so",
            "target=host-cuda-cuda_capability_86-user_context-strict_float-no_runtime-no_asserts",
            "autoscheduler=Anderson2021",
            "autoscheduler.parallelism=82",
        ]
        hl.main()

else:
    hl.load_plugin(
        "/home/jansel/conda/envs/pytorch/lib/libautoschedule_anderson2021.so"
    )
    target = hl.Target(
        "host-cuda-cuda_capability_86-user_context-strict_float-no_runtime-no_asserts"
    )
    autoscheduler = hl.AutoschedulerParams("Anderson2021", {"parallelism": 82})
    with hl.GeneratorContext(target, autoscheduler):
        gen = Kernel()
        pipeline = gen._build_pipeline()
        # gen.compile_to_callable() does not run the autoscheduler
        pipeline.apply_autoscheduler(target, autoscheduler)
        kernel = pipeline.compile_to_callable(
            [
                gen._get_input_parameter(a.name)._to_argument()
                for a in gen._get_arginfos()
                if a.dir == hl.ArgInfoDirection.Input
            ],
            target,
        )

Output:

Unhandled exception: Internal Error at /home/jansel/Halide/src/CodeGen_LLVM.cpp:1336 triggered by user code at : Condition failed: types_match: Codegen of Expr (float32)is_inf_f32((float32)strict_float(reinterpret<float32>((uint32)bitwise_or((uint32)shift_left(uint32((uint16)bitwise_and((uint16)t266, (uint16)32768)), (uint32)16), select((uint16)t267 == (uint16)0, (uint32)0, select((uint16)t267 < (uint16)1024, reinterpret<uint32>((float32)strict_float(float32((uint16)t267))) - (uint32)201326592, select((uint16)t267 >= (uint16)31744, (uint32)bitwise_or((uint32)t268, (uint32)2139095040), (uint32)t268 + (uint32)939524096))))))) of type float32 did not produce llvm IR of the corresponding llvm type.

Traceback (most recent call last):
  File "/home/jansel/pytorch/repro.py", line 61, in <module>
    hl.main()
RuntimeError: Generator failed: -1

It works with 32-bit floats, however:

diff --git a/repro.py b/repro.py
index 8ce5bd5a9eb..236e4a9784a 100644
--- a/repro.py
+++ b/repro.py
@@ -15,7 +15,7 @@ class Kernel:
         hr1 = rdom[0]
         tmp0 = hl.Func("tmp0")
         tmp0[h0, h1] = hl.cast(
-            hl.Float(16),
+            hl.Float(32),
             in_ptr0[
                 h0,
                 h1,
jansel commented 1 week ago

Looks like hl.is_nan() has the same issue.
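
For reference, a minimal sketch of the is_nan variant (hypothetical one-line swap in the repro above):

# Same generator as the repro, with only the intrinsic swapped; this is
# reported to fail with the same types_match error.
tmp27[h0, h1] = hl.is_nan(tmp0[h0, h1])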

abadams commented 1 week ago

Oh I see the bug. It's in EmulateFloat16Math.cpp. It rewrites all float16 transcendentals/intrinsics to their float32 equivalents, but it just assumes these functions return floats, which isn't true for is_inf/is_nan.
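
Until that's fixed, a possible user-side workaround (a sketch, inferred from the 32-bit diff above working, not verified against this repro) is to widen to float32 before the check so the float16 emulation pass never sees the intrinsic:

# Hypothetical workaround: perform the inf test in float32; per the
# report above, the float32 path codegens correctly.
tmp27[h0, h1] = hl.is_inf(hl.cast(hl.Float(32), tmp0[h0, h1]))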