diku-dk / futhark

:boom::computer::boom: A data-parallel functional programming language
http://futhark-lang.org
ISC License
2.4k stars 167 forks source link

Internal compiler error: Type error after pass 'tile loops': #1243

Closed stuebinm closed 3 years ago

stuebinm commented 3 years ago

I'm new to futhark, and stumbled across the following issue:

let DistMatrix [n] [m] (a : [n]f32) (b : [m]f32) : [n][m]f32 = 
    let initial = 0 |> replicate m
                    |> replicate n
    in let outside = (replicate (m+1) f32.inf) with [0] = 0
    in (loop (D, column) = (initial, outside) for i < n do
        let next_row = loop cs = replicate (m+1) f32.inf for j in 1...m do
            cs with [j] = f32.minimum [cs[j-1], column[j]]
        in (D with [i] = (next_row[1:] :> [m]f32), next_row))
    |> \(x,_) -> x

let compute_deltas (d : i64) : [][d]i64 =
    let ret = loop acc = [[]] for i in 1...d do
          flatten (map (\x -> [(x ++ [0]) :> [i]i64, (x ++ [1]) :> [i]i64]) acc)
    in ret :> [][d]i64

let argmin [n] (as : [n]f32) : i64 =
    zip (iota n) as
    |> reduce (\(i1,v1) (i2,v2) -> if v1 < v2 then (i1,v1) else (i2,v2)) (0,f32.inf)
    |> \(i,v) -> i

let parallelDTW [d] [n] (s : [d][n]f32) : [][d]i64 =
    let tri = loop acc = [] for i < d do
              acc ++ zip (replicate i i) (iota i)
    in let D = tri |> map (\(a,b) -> DistMatrix s[b] s[a])
    in let deltas = compute_deltas d in
    loop matching = [replicate d (n-1)] while (i64.maximum <| last matching) != 0 do
        let costs = deltas
            |> map (\pos -> f32.sum (map (\(e1,e2) -> D[0,pos[e1],pos[e2]]) tri))
        in
        matching ++ [deltas[argmin costs]]

let main : [][]i64 = parallelDTW [[1,2,3,4],[4,5,6,7]]

(I've minimised the program as much as I could; whatever the error is, it appears to be fickle, and is prone to disappearing when i replace e.g. let-bindings with default values).

The code yields an internal compiler error when trying to compile with cuda or opencl (or pyopencl), but not when compiling to c (using the latest version from git, compiled with nix via the provided default.nix on nixos-stable):

Internal compiler error.
Please report this at https://github.com/diku-dk/futhark/issues.
Type error after pass 'tile loops':
In expression of statement {[size_7309][4i64][4i64]f32 defunc_0_f_res_7326}
in true branch
In expression of statement {[size_7309][4i64][4i64]f32 defunc_0_f_res_7508}
In expression of statement {[segmap_group_size_7505][4i64][4i64]f32 tiled_inside_loop_8810,
 [segmap_group_size_7505][5i64]f32 tiled_inside_loop_8811}
Inside the loop body
In expression of statement {[segmap_group_size_7505][5i64]f32 tiled_inside_loop_8786}
Inside the loop body
In expression of statement {f32 arr_elem_7531}
Use of unknown variable column_7516.

Interestingly, the entire program works just fine if I replace the f32.minimum in DistMatrix with an equivalent expression using foldr (but not reduce).

athas commented 3 years ago

This is (yet another) bug in one of the most tricky optimisation passes. Minimised example:

let DistMatrix [n] [m] (a : [n]f32) (b : [m]f32) : [n][m]f32 =
    let initial = replicate n (replicate m 0)
    let outside = (replicate (m+1) f32.inf) with [0] = 0
    in (loop (D, column) = (initial, outside) for i < n do
        let next_row = loop cs = replicate (m+1) f32.inf for j in 1...m do
                         cs with [j] = f32.minimum [cs[j-1], column[j]]
        in (D with [i] = (next_row[1:] :> [m]f32), next_row))
    |> \(x,_) -> x

let main [d] [n] (s : [d][n]f32) tri =
  map (\(a,b) -> DistMatrix s[b] s[a]) tri

Probably related to an attempt to tile inside doubly-nested loops. No idea whether it makes sense to even try.

athas commented 3 years ago

Although note that calling f32.minimum on a two-element array is very wasteful. Just use f32.min cs[j-1] column[j].

stuebinm commented 3 years ago

a thanks for the tip, I hadn't noticed that function yet (though in my original code it was a slightly longer array, just shortened it for the issue report here)

athas commented 3 years ago

Generally, the Futhark compiler will assume that most such functions are given "large" arrays and move heaven and earth to parallelise them. This can result in (far) more overhead than is really justified, like here (although it still shouldn't crash). It doesn't understand that some are small (even if it's a constant). That's certainly a compiler weakness, but as a consequence, good practice is to use functions such as foldl or manual loops when you know the arrays are going to be small (or just unroll manually for particularly small arrays, like this one).