diku-dk / futhark

:boom::computer::boom: A data-parallel functional programming language
http://futhark-lang.org
ISC License
2.4k stars 167 forks source link

Disregard copies with identical source and destination index functions as part of gpu impgen #1459

Closed Munksgaard closed 3 years ago

Munksgaard commented 3 years ago

When memory coalescing has been implemented, it will affect programs like the following nw-cosmin.fut:

nw-cosmin.fut
```futhark
-- Code and comments based on
-- https://github.com/kkushagra/rodinia/blob/master/openmp/nw
--
-- ==
-- entry: nw_flat
-- compiled random input { 16i64 10i32 [2362369]i32 [2362369]i32 } auto output
-- compiled random input { 32i64 10i32 [2362369]i32 [2362369]i32 } auto output
-- compiled random input { 64i64 10i32 [2362369]i32 [2362369]i32 } auto output

-- Wrapper around the compiler intrinsic `intrinsics.flat_index_2d`: takes a
-- 2-D view of the flat array `as`, starting at `offset`, and coerces the
-- result to shape [n1][n2].  Judging by the generated LMADs, `n1`/`n2` are
-- the extents and `s1`/`s2` the strides (in elements) of the two dimensions.
let flat_index_2d [n] 'a (as: [n]a) (offset: i64) (n1: i64) (s1: i64) (n2: i64) (s2: i64) : [n1][n2]a =
  intrinsics.flat_index_2d(as, offset, n1, s1, n2, s2) :> [n1][n2]a

-- 3-D counterpart of `flat_index_2d`: wraps `intrinsics.flat_index_3d` and
-- coerces the result to shape [n1][n2][n3].
let flat_index_3d [n] 'a (as: [n]a) (offset: i64) (n1: i64) (s1: i64) (n2: i64) (s2: i64) (n3: i64) (s3: i64) : [n1][n2][n3]a =
  intrinsics.flat_index_3d(as, offset, n1, s1, n2, s2, n3, s3) :> [n1][n2][n3]a

-- Wrapper around `intrinsics.flat_update_3d`: consumes the flat array `as`
-- and returns it with the 3-D array `asss` written at `offset`; `s1`, `s2`
-- and `s3` are passed through as the per-dimension strides.
let flat_update_3d [n][k][l][p] 'a (as: *[n]a) (offset: i64) (s1: i64) (s2: i64) (s3: i64) (asss: [k][l][p]a) : *[n]a =
  intrinsics.flat_update_3d(as, offset, s1, s2, s3, asss)

-- Value of cell (y, x) of `block`: the maximum of
--   * the left neighbour minus `pen`,
--   * the upper neighbour minus `pen`,
--   * the upper-left neighbour plus the reference score ref[y - 1, x - 1].
-- NOTE(review): `#[unsafe]` presumably disables bounds checking of these
-- indexing operations -- confirm against the Futhark attribute documentation.
let mkVal [bp1][b] (y:i32) (x:i32) (pen:i32) (block:[bp1][bp1]i32) (ref:[b][b]i32) : i32 =
  #[unsafe]
  i32.max (block[y, x - 1] - pen) (block[y - 1, x] - pen)
  |> i32.max (block[y - 1, x - 1] + ref[y - 1, x - 1])

-- Computes one b x b block.  A zero-filled (b+1) x (b+1) scratch block is
-- created, `above` is written into its row 0 and `left` into column 0 of
-- rows 1.., the remaining cells are filled one anti-diagonal at a time with
-- `mkVal`, and finally the border is sliced away so that only the b x b
-- interior is returned.
let process_block [b][bp1]
                  (penalty: i32)
                  (above: [bp1]i32)
                  (left: [b]i32)
                  (ref: [b][b]i32): *[b][b]i32 =
  -- The assertion ties the two size parameters together: bp1 must be b + 1.
  let block = assert (b + 1 == bp1) (tabulate_2d bp1 bp1 (\_ _ -> 0))
  let block[0, 0:] = above
  let block[1:, 0] = left

  -- Process the first half (anti-diagonally) of the block
  let block =
    loop block for m < b do
        -- Lanes with tx > m do not take part in anti-diagonal m; they yield
        -- the index (-1, -1) and the value 0.
        -- NOTE(review): presumably scatter_2d ignores such out-of-bounds
        -- writes -- confirm.
        let inds =
            tabulate b (\tx -> if tx > m then (-1, -1)
                               else let ind_x = i32.i64 (tx + 1)
                                    let ind_y = i32.i64 (m - tx + 1)
                                    in (i64.i32 ind_y, i64.i32 ind_x))
        let vals =
            -- tabulate over the m'th anti-diagonal before the middle
            tabulate b
                     (\tx -> if tx > m then 0
                             else let ind_x = i32.i64 (tx + 1)
                                  let ind_y = i32.i64 (m - tx + 1)
                                  let v = mkVal ind_y ind_x penalty block ref
                                  in v)
        in scatter_2d block inds vals

  -- Process the second half (anti-diagonally) of the block
  let block = loop block for m < b-1 do
        -- Re-bind m so that it counts down from b - 2 to 0.
        let m = b - 2 - m
        let inds = tabulate b (\tx -> (
                    if tx > m then (-1, -1)
                    else let ind_x = i32.i64 (tx + b - m)
                         let ind_y = i32.i64 (b - tx)
                         in ((i64.i32 ind_y, i64.i32 ind_x)) )
                )
        let vals =
            -- tabulate over the m'th anti-diagonal after the middle
            tabulate b (\tx -> (
                    if tx > m then (0)
                    else let ind_x = i32.i64 (tx + b - m)
                         let ind_y = i32.i64 (b - tx)
                         let v = mkVal ind_y ind_x penalty block ref
                         in v ))
        in scatter_2d block inds vals

  -- Drop row 0 and column 0 (the `above`/`left` border).
  in block[1:, 1:] :> [b][b]i32

-- Entry point.  `input` and `refs` are flat arrays of n elements; the row
-- length used for all index arithmetic is sqrt(n) converted to i64.
-- Two loops update `input` in place: in iteration i the first loop maps
-- `process_block` over i + 1 blocks, the second over num_blocks - i - 1
-- blocks.  In each iteration the three arguments of every block (a row of
-- bp1 elements with stride 1, a column of block_size elements with stride
-- row_length, and a block_size x block_size tile of `refs`) are taken as
-- flat slices, and the mapped result is written back with `flat_update_3d`.
entry nw_flat [n]
              (block_size: i64)
              (penalty: i32)
              (input: *[n]i32)
              (refs: [n]i32)
              : *[n]i32 =
  let row_length = i64.f64 <| f64.sqrt <| f64.i64 n
  let num_blocks = -- assert ((row_length - 1) % b == 0) <|
                   (row_length - 1) / block_size
  let bp1 = block_size + 1

  let input =
    loop input for i < num_blocks do
    let ip1 = i + 1
    let v =
      #[incremental_flattening(only_intra)]
      map3 (process_block penalty)
      (flat_index_2d input (i * block_size)
                     ip1 (row_length * block_size - block_size)
                     bp1 1)
      (flat_index_2d input (row_length + i * block_size)
                     ip1 (row_length * block_size - block_size)
                     block_size row_length)
      (flat_index_3d refs (row_length + 1 + i * block_size)
                     ip1 (row_length * block_size - block_size)
                     block_size row_length
                     block_size 1i64)
    -- Same offset and strides as the `refs` slice above, but applied to
    -- `input`.
    in flat_update_3d
         input
         (row_length + 1 + i * block_size)
         (row_length * block_size - block_size)
         (row_length)
         1
         v

  let input =
    loop input for i < num_blocks - 1 do
    let v =
      #[incremental_flattening(only_intra)]
      map3 (process_block penalty)
      (flat_index_2d input (((i + 1) * block_size + 1) * row_length - block_size - 1)
                     (num_blocks - i - 1) (row_length * block_size - block_size)
                     bp1 1i64)
      (flat_index_2d input (((i + 1) * block_size + 1) * row_length - block_size - 1 + row_length)
                     (num_blocks - i - 1) (row_length * block_size - block_size)
                     block_size row_length)
      (flat_index_3d refs (((i + 1) * block_size + 2) * row_length - block_size)
                     (num_blocks - i - 1) (row_length * block_size - block_size)
                     block_size row_length
                     block_size 1i64)
    -- Same offset and strides as the `refs` slice above, but applied to
    -- `input`.
    in flat_update_3d
         input
         (((i + 1) * block_size + 2) * row_length - block_size)
         (row_length * block_size - block_size)
         (row_length)
         1
         v

  in input
```

In particular, the goal is to perform the final copy after mapping `process_block` directly into the output memory space, instead of going through an intermediate global memory array.

In GPUMem, this means that the results of the two segmap-group calls will be written directly into the output memory:

@@ -233,11 +233,11 @@ entry("nw_flat",
       let {mem_12915 : mem} =
         #[incremental_flattening(only_intra)]
         alloc(bytes_12914)
-      let {defunc_5_map_res_12592 : [ip1_12210][block_size_12165][block_size_12165]i32 @ mem_12915 ->
+      let {defunc_5_map_res_12592 : [ip1_12210][block_size_12165][block_size_12165]i32 @ input_mem_12853 ->
                                     {base: [ip1_12210, block_size_12165,
                                             block_size_12165]; contiguous: true;
-                                     LMADs: [{offset: 0i64;
-                                              strides: [mul_nw64 (block_size_12165) (block_size_12165), block_size_12165, 1i64];
+                                     LMADs: [{offset: flat_index_3d_arg_12212;
+                                              strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171, 1i64];
                                               rotates: [0i64, 0i64, 0i64];
                                               shape: [ip1_12210, block_size_12165, block_size_12165];
                                               permutation: [0, 1, 2];
@@ -699,11 +699,11 @@ entry("nw_flat",
       let {mem_12979 : mem} =
         #[incremental_flattening(only_intra)]
         alloc(bytes_12978)
-      let {defunc_5_map_res_12745 : [flat_index_3d_arg_12373][block_size_12165][block_size_12165]i32 @ mem_12979 ->
+      let {defunc_5_map_res_12745 : [flat_index_3d_arg_12373][block_size_12165][block_size_12165]i32 @ input_mem_12853 ->
                                     {base: [flat_index_3d_arg_12373,
                                             block_size_12165, block_size_12165];
-                                     contiguous: true; LMADs: [{offset: 0i64;
-                                                                strides: [mul_nw64 (block_size_12165) (block_size_12165), block_size_12165, 1i64];
+                                     contiguous: true; LMADs: [{offset: flat_index_3d_arg_12378;
+                                                                strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171, 1i64];
                                                                 rotates: [0i64, 0i64, 0i64];
                                                                 shape: [flat_index_3d_arg_12373, block_size_12165, block_size_12165];
                                                                 permutation: [0, 1, 2];

For completeness, here is the entire generated nw-cosmin.fut_gpu_mem:

nw-cosmin.fut_gpu_mem ```futhark entry("nw_flat", {direct, direct, *direct, direct}, {*direct}) entry_nw_flat (input_mem_12853 : mem, refs_mem_12854 : mem, n_12164 : i64, block_size_12165 : i64, penalty_12166 : i32, input_12167 : *[n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}, refs_12168 : [n_12164]i32 @ refs_mem_12854 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}) : {mem, *[n_12164]i32 @ ?0-> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = { let {i64_res_12169 : f64} = sitofp i64 n_12164 to f64 let {defunc_0_f_res_12170 : f64} = apply sqrt64(i64_res_12169) : {f64} let {defunc_0_f_res_12171 : i64} = fptosi f64 defunc_0_f_res_12170 to i64 let {x_12172 : i64} = sub64(defunc_0_f_res_12171, 1i64) let {zero_12173 : bool} = eq_i64(block_size_12165, 0i64) let {nonzero_12174 : bool} = not zero_12173 let {nonzero_cert_12175 : unit} = assert(nonzero_12174, {"division by zero"}, "tests/slice-lmads/nw-cosmin.fut:111:20-48") let {num_blocks_12176 : i64} = #{nonzero_cert_12175} sdiv64(x_12172, block_size_12165) let {bp1_12177 : i64} = add64(1i64, block_size_12165) let {x_12178 : i64} = mul64(block_size_12165, defunc_0_f_res_12171) let {flat_index_3d_arg_12179 : i64} = sub64(x_12178, block_size_12165) let {x_12180 : i64} = add64(1i64, defunc_0_f_res_12171) let {loop_nonempty_12181 : bool} = slt64(0i64, num_blocks_12176) let {loop_not_taken_12182 : bool} = not loop_nonempty_12181 let {y_12183 : bool} = slt64(0i64, bp1_12177) let {empty_slice_12184 : bool} = eq_i64(bp1_12177, 0i64) let {zero_leq_i_p_m_t_s_12185 : bool} = sle64(0i64, block_size_12165) let {i_p_m_t_s_leq_w_12186 : bool} = slt64(block_size_12165, bp1_12177) let 
{i_lte_j_12187 : bool} = sle64(0i64, bp1_12177) let {y_12188 : bool} = logand(zero_leq_i_p_m_t_s_12185, i_p_m_t_s_leq_w_12186) let {y_12189 : bool} = logand(i_lte_j_12187, y_12188) let {ok_or_empty_12190 : bool} = logor(empty_slice_12184, y_12189) let {index_ok_12191 : bool} = logand(y_12183, ok_or_empty_12190) let {protect_assert_disj_12192 : bool} = logor(loop_not_taken_12182, index_ok_12191) let {index_certs_12193 : unit} = assert(protect_assert_disj_12192, {"Index [", 0i64 : i64, ", ", 0i64 : i64, ":] out of bounds for array of shape [", bp1_12177 : i64, "][", bp1_12177 : i64, "]."}, "tests/slice-lmads/nw-cosmin.fut:59:3-101:31") let {i_lte_j_12194 : bool} = sle64(1i64, bp1_12177) let {y_12195 : bool} = logand(y_12188, i_lte_j_12194) let {ok_or_empty_12196 : bool} = logor(zero_12173, y_12195) let {index_ok_12197 : bool} = logand(y_12183, ok_or_empty_12196) let {protect_assert_disj_12198 : bool} = logor(loop_not_taken_12182, index_ok_12197) let {index_certs_12199 : unit} = assert(protect_assert_disj_12198, {"Index [", 1i64 : i64, ":, ", 0i64 : i64, "] out of bounds for array of shape [", bp1_12177 : i64, "][", bp1_12177 : i64, "]."}, "tests/slice-lmads/nw-cosmin.fut:60:3-101:31") let {upper_bound_12200 : i64} = sub64(block_size_12165, 1i64) let {x_12201 : i64} = sub64(block_size_12165, 2i64) let {index_ok_12202 : bool} = logand(ok_or_empty_12196, ok_or_empty_12196) let {protect_assert_disj_12203 : bool} = logor(loop_not_taken_12182, index_ok_12202) let {index_certs_12204 : unit} = assert(protect_assert_disj_12203, {"Index [", 1i64 : i64, ":, ", 1i64 : i64, ":] out of bounds for array of shape [", bp1_12177 : i64, "][", bp1_12177 : i64, "]."}, "tests/slice-lmads/nw-cosmin.fut:101:6-18") let {span_12205 : i64} = mul_nw64(defunc_0_f_res_12171, upper_bound_12200) let {binop_x_12863 : i64} = mul_nw64(bp1_12177, bp1_12177) let {binop_y_12864 : i64} = mul_nw64(4i64, binop_x_12863) let {bytes_12865 : i64} = smax64(0i64, binop_y_12864) let {binop_y_12878 : i64} = 
mul_nw64(4i64, block_size_12165) let {bytes_12879 : i64} = smax64(0i64, binop_y_12878) let {input_12207 : [n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = loop {input_12209 : *[n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = {input_12167} for i_12208:i64 < num_blocks_12176 do { let {ip1_12210 : i64} = add64(1i64, i_12208) let {y_12211 : i64} = mul64(block_size_12165, i_12208) let {flat_index_3d_arg_12212 : i64} = add64(x_12180, y_12211) let {offset_inbounds_down_12213 : bool} = ule64(0i64, flat_index_3d_arg_12212) let {offset_inbounds_up_12214 : bool} = ult64(flat_index_3d_arg_12212, n_12164) let {span_12215 : i64} = mul_nw64(flat_index_3d_arg_12179, i_12208) let {span_and_lower_12216 : i64} = add_nw64(flat_index_3d_arg_12212, span_12215) let {minimum_12217 : i64} = umin64(flat_index_3d_arg_12212, span_and_lower_12216) let {maximum_12218 : i64} = umax64(flat_index_3d_arg_12212, span_and_lower_12216) let {span_and_lower_12219 : i64} = add_nw64(span_12205, minimum_12217) let {span_and_upper_12220 : i64} = add_nw64(span_12205, maximum_12218) let {minimum_12221 : i64} = umin64(minimum_12217, span_and_lower_12219) let {maximum_12222 : i64} = umax64(maximum_12218, span_and_upper_12220) let {span_and_lower_12223 : i64} = add_nw64(upper_bound_12200, minimum_12221) let {span_and_upper_12224 : i64} = add_nw64(upper_bound_12200, maximum_12222) let {minimum_12225 : i64} = umin64(minimum_12221, span_and_lower_12223) let {maximum_12226 : i64} = umax64(maximum_12222, span_and_upper_12224) let {min_in_bounds_12227 : bool} = ule64(0i64, minimum_12225) let {max_in_bounds_12228 : bool} = ult64(maximum_12226, n_12164) let {inBounds_12229 : bool} = logand(offset_inbounds_down_12213, offset_inbounds_up_12214) 
let {inBounds_12230 : bool} = logand(min_in_bounds_12227, inBounds_12229) let {inBounds_12231 : bool} = logand(max_in_bounds_12228, inBounds_12230) let {bounds_cert_12232 : unit} = assert(inBounds_12231, {"Flat slice out of bounds: n_4488 and [(n1_4492, s1_4493), (n2_4494, s2_4495), (n3_4496, s3_4497)]"}, "tests/slice-lmads/intrinsics.fut:8:3-62") let {flat_index_3d_res_12233 : [ip1_12210][block_size_12165][block_size_12165]i32 @ refs_mem_12854 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: flat_index_3d_arg_12212; strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171, 1i64]; rotates: [0i64, 0i64, 0i64]; shape: [ip1_12210, block_size_12165, block_size_12165]; permutation: [0, 1, 2]; monotonicity: [Unknown, Unknown, Unknown]}]}} = #{bounds_cert_12232} refs_12168[flat_index_3d_arg_12212; ip1_12210 : flat_index_3d_arg_12179, block_size_12165 : defunc_0_f_res_12171, block_size_12165 : 1i64] let {flat_index_2d_arg_12234 : i64} = add64(defunc_0_f_res_12171, y_12211) let {offset_inbounds_down_12235 : bool} = ule64(0i64, flat_index_2d_arg_12234) let {offset_inbounds_up_12236 : bool} = ult64(flat_index_2d_arg_12234, n_12164) let {span_and_lower_12237 : i64} = add_nw64(span_12215, flat_index_2d_arg_12234) let {minimum_12238 : i64} = umin64(flat_index_2d_arg_12234, span_and_lower_12237) let {maximum_12239 : i64} = umax64(flat_index_2d_arg_12234, span_and_lower_12237) let {span_and_lower_12240 : i64} = add_nw64(span_12205, minimum_12238) let {span_and_upper_12241 : i64} = add_nw64(span_12205, maximum_12239) let {minimum_12242 : i64} = umin64(minimum_12238, span_and_lower_12240) let {maximum_12243 : i64} = umax64(maximum_12239, span_and_upper_12241) let {min_in_bounds_12244 : bool} = ule64(0i64, minimum_12242) let {max_in_bounds_12245 : bool} = ult64(maximum_12243, n_12164) let {inBounds_12246 : bool} = logand(offset_inbounds_down_12235, offset_inbounds_up_12236) let {inBounds_12247 : bool} = logand(min_in_bounds_12244, inBounds_12246) let {inBounds_12248 : bool} = 
logand(max_in_bounds_12245, inBounds_12247) let {bounds_cert_12249 : unit} = assert(inBounds_12248, {"Flat slice out of bounds: n_4457 and [(n1_4461, s1_4462), (n2_4463, s2_4464)]"}, "tests/slice-lmads/intrinsics.fut:2:3-54") let {flat_index_2d_res_12250 : [ip1_12210][block_size_12165]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: flat_index_2d_arg_12234; strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171]; rotates: [0i64, 0i64]; shape: [ip1_12210, block_size_12165]; permutation: [0, 1]; monotonicity: [Unknown, Unknown]}]}} = #{bounds_cert_12249} input_12209[flat_index_2d_arg_12234; ip1_12210 : flat_index_3d_arg_12179, block_size_12165 : defunc_0_f_res_12171] let {offset_inbounds_down_12251 : bool} = ule64(0i64, y_12211) let {offset_inbounds_up_12252 : bool} = ult64(y_12211, n_12164) let {span_and_lower_12253 : i64} = add_nw64(y_12211, span_12215) let {minimum_12254 : i64} = umin64(y_12211, span_and_lower_12253) let {maximum_12255 : i64} = umax64(y_12211, span_and_lower_12253) let {span_and_lower_12256 : i64} = add_nw64(block_size_12165, minimum_12254) let {span_and_upper_12257 : i64} = add_nw64(block_size_12165, maximum_12255) let {minimum_12258 : i64} = umin64(minimum_12254, span_and_lower_12256) let {maximum_12259 : i64} = umax64(maximum_12255, span_and_upper_12257) let {min_in_bounds_12260 : bool} = ule64(0i64, minimum_12258) let {max_in_bounds_12261 : bool} = ult64(maximum_12259, n_12164) let {inBounds_12262 : bool} = logand(offset_inbounds_down_12251, offset_inbounds_up_12252) let {inBounds_12263 : bool} = logand(min_in_bounds_12260, inBounds_12262) let {inBounds_12264 : bool} = logand(max_in_bounds_12261, inBounds_12263) let {bounds_cert_12265 : unit} = assert(inBounds_12264, {"Flat slice out of bounds: n_4457 and [(n1_4461, s1_4462), (n2_4463, s2_4464)]"}, "tests/slice-lmads/intrinsics.fut:2:3-54") let {flat_index_2d_res_12266 : [ip1_12210][bp1_12177]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: 
[{offset: y_12211; strides: [flat_index_3d_arg_12179, 1i64]; rotates: [0i64, 0i64]; shape: [ip1_12210, bp1_12177]; permutation: [0, 1]; monotonicity: [Unknown, Unknown]}]}} = #{bounds_cert_12265} input_12209[y_12211; ip1_12210 : flat_index_3d_arg_12179, bp1_12177 : 1i64] let {binop_x_12911 : i64} = #[incremental_flattening(only_intra)] mul_nw64(block_size_12165, ip1_12210) let {binop_x_12912 : i64} = #[incremental_flattening(only_intra)] mul_nw64(block_size_12165, binop_x_12911) let {binop_y_12913 : i64} = #[incremental_flattening(only_intra)] mul_nw64(4i64, binop_x_12912) let {bytes_12914 : i64} = #[incremental_flattening(only_intra)] smax64(0i64, binop_y_12913) let {mem_12915 : mem} = #[incremental_flattening(only_intra)] alloc(bytes_12914) let {defunc_5_map_res_12592 : [ip1_12210][block_size_12165][block_size_12165]i32 @ input_mem_12853 -> {base: [ip1_12210, block_size_12165, block_size_12165]; contiguous: true; LMADs: [{offset: flat_index_3d_arg_12212; strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171, 1i64]; rotates: [0i64, 0i64, 0i64]; shape: [ip1_12210, block_size_12165, block_size_12165]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}} = #[incremental_flattening(only_intra)] segmap(group; #groups=ip1_12210; groupsize=block_size_12165) (gtid_12531 < ip1_12210) (~phys_tid_12591) : {[block_size_12165][block_size_12165]i32} { let {color_12997 : mem@local} = alloc(bytes_12879, @local) let {color_12998 : mem@local} = alloc(bytes_12865, @local) let {x_12593 : [bp1_12177]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: add_nw64 (y_12211) (mul_nw64 (gtid_12531) (flat_index_3d_arg_12179)); strides: [1i64]; rotates: [0i64]; shape: [bp1_12177]; permutation: [0]; monotonicity: [Unknown]}]}} = flat_index_2d_res_12266[gtid_12531, 0i64 :+ bp1_12177 * 1i64] let {x_12595 : [block_size_12165]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: add_nw64 (flat_index_2d_arg_12234) (mul_nw64 (gtid_12531) 
(flat_index_3d_arg_12179)); strides: [defunc_0_f_res_12171]; rotates: [0i64]; shape: [block_size_12165]; permutation: [0]; monotonicity: [Unknown]}]}} = flat_index_2d_res_12250[gtid_12531, 0i64 :+ block_size_12165 * 1i64] let {defunc_2_map_res_12599 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = replicate([bp1_12177][bp1_12177], 0i32) let {block_12600 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = #{index_certs_12193} defunc_2_map_res_12599 with [0i64, 0i64 :+ bp1_12177 * 1i64] = x_12593 let {block_12601 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = #{index_certs_12199} block_12600 with [1i64 :+ block_size_12165 * 1i64, 0i64] = x_12595 let {block_12602 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = loop {block_12604 : *[bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = {block_12601} for m_12603:i64 < block_size_12165 do { let {defunc_1_map_res_12605 : [block_size_12165]i32 @ color_12997 -> {base: [block_size_12165]; contiguous: true; LMADs: [{offset: 0i64; strides: 
[1i64]; rotates: [0i64]; shape: [block_size_12165]; permutation: [0]; monotonicity: [Inc]}]}} = segmap(thread; #groups=ip1_12210; groupsize=block_size_12165) (gtid_12534 < block_size_12165) (~phys_tid_12535) : {i32} { let {cond_12607 : bool} = slt64(m_12603, gtid_12534) let {defunc_0_f_res_12608 : i32} = if cond_12607 then {0i32} else { let {i64_arg_12609 : i64} = add64(1i64, gtid_12534) let {i64_res_12610 : i32} = sext i64 i64_arg_12609 to i32 let {x_12611 : i64} = sub64(m_12603, gtid_12534) let {i64_arg_12612 : i64} = add64(1i64, x_12611) let {i64_res_12613 : i32} = sext i64 i64_arg_12612 to i32 let {i_12614 : i32} = sub32(i64_res_12613, 1i32) let {i_12615 : i64} = sext i32 i_12614 to i64 let {x_12616 : i64} = sext i32 i64_res_12610 to i64 let {x_12617 : i32} = block_12604[i_12615, x_12616] let {max_arg_12618 : i32} = sub32(x_12617, penalty_12166) let {y_12619 : i64} = sext i32 i64_res_12613 to i64 let {i_12620 : i32} = sub32(i64_res_12610, 1i32) let {i_12621 : i64} = sext i32 i_12620 to i64 let {x_12622 : i32} = block_12604[y_12619, i_12621] let {max_arg_12623 : i32} = sub32(x_12622, penalty_12166) let {max_res_12624 : i32} = smax32(max_arg_12618, max_arg_12623) let {x_12625 : i32} = block_12604[i_12615, i_12621] let {y_12626 : i32} = flat_index_3d_res_12233[gtid_12531, i_12615, i_12621] let {max_arg_12627 : i32} = add32(x_12625, y_12626) let {defunc_0_f_res_12628 : i32} = smax32(max_res_12624, max_arg_12627) in {defunc_0_f_res_12628} } : {i32} return {returns defunc_0_f_res_12608} } let {scatter_2d_res_12629 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = segmap(thread; #groups=ip1_12210; groupsize=block_size_12165) (write_i_12559 < block_size_12165) (~phys_tid_12560) : {i32} { let {write_value_12631 : i32} = defunc_1_map_res_12605[write_i_12559] let {cond_12632 
: bool} = slt64(m_12603, write_i_12559) let {defunc_0_f_res_12633 : i64, defunc_0_f_res_12634 : i64} = if cond_12632 then {-1i64, -1i64} else { let {i64_arg_12635 : i64} = add64(1i64, write_i_12559) let {i64_res_12636 : i32} = sext i64 i64_arg_12635 to i32 let {x_12637 : i64} = sub64(m_12603, write_i_12559) let {i64_arg_12638 : i64} = add64(1i64, x_12637) let {i64_res_12639 : i32} = sext i64 i64_arg_12638 to i32 let {i32_res_12640 : i64} = sext i32 i64_res_12639 to i64 let {i32_res_12641 : i64} = sext i32 i64_res_12636 to i64 in {i32_res_12640, i32_res_12641} } : {i64, i64} return {block_12604 : [bp1_12177][bp1_12177] with ([defunc_0_f_res_12633, defunc_0_f_res_12634] = write_value_12631)} } in {scatter_2d_res_12629} } let {block_12642 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = loop {block_12644 : *[bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = {block_12602} for m_12643:i64 < upper_bound_12200 do { let {m_12645 : i64} = sub64(x_12201, m_12643) let {defunc_1_map_res_12646 : [block_size_12165]i32 @ color_12997 -> {base: [block_size_12165]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [block_size_12165]; permutation: [0]; monotonicity: [Inc]}]}} = segmap(thread; #groups=ip1_12210; groupsize=block_size_12165) (gtid_12561 < block_size_12165) (~phys_tid_12562) : {i32} { let {cond_12648 : bool} = slt64(m_12645, gtid_12561) let {defunc_0_f_res_12649 : i32} = if cond_12648 then {0i32} else { let {x_12650 : i64} = add64(block_size_12165, gtid_12561) let {i64_arg_12651 : i64} = sub64(x_12650, m_12645) let {i64_res_12652 : i32} = sext i64 
i64_arg_12651 to i32 let {i64_arg_12653 : i64} = sub64(block_size_12165, gtid_12561) let {i64_res_12654 : i32} = sext i64 i64_arg_12653 to i32 let {i_12655 : i32} = sub32(i64_res_12654, 1i32) let {i_12656 : i64} = sext i32 i_12655 to i64 let {x_12657 : i64} = sext i32 i64_res_12652 to i64 let {x_12658 : i32} = block_12644[i_12656, x_12657] let {max_arg_12659 : i32} = sub32(x_12658, penalty_12166) let {y_12660 : i64} = sext i32 i64_res_12654 to i64 let {i_12661 : i32} = sub32(i64_res_12652, 1i32) let {i_12662 : i64} = sext i32 i_12661 to i64 let {x_12663 : i32} = block_12644[y_12660, i_12662] let {max_arg_12664 : i32} = sub32(x_12663, penalty_12166) let {max_res_12665 : i32} = smax32(max_arg_12659, max_arg_12664) let {x_12666 : i32} = block_12644[i_12656, i_12662] let {y_12667 : i32} = flat_index_3d_res_12233[gtid_12531, i_12656, i_12662] let {max_arg_12668 : i32} = add32(x_12666, y_12667) let {defunc_0_f_res_12669 : i32} = smax32(max_res_12665, max_arg_12668) in {defunc_0_f_res_12669} } : {i32} return {returns defunc_0_f_res_12649} } let {scatter_2d_res_12670 : [bp1_12177][bp1_12177]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = segmap(thread; #groups=ip1_12210; groupsize=block_size_12165) (write_i_12586 < block_size_12165) (~phys_tid_12587) : {i32} { let {write_value_12672 : i32} = defunc_1_map_res_12646[write_i_12586] let {cond_12673 : bool} = slt64(m_12645, write_i_12586) let {defunc_0_f_res_12674 : i64, defunc_0_f_res_12675 : i64} = if cond_12673 then {-1i64, -1i64} else { let {x_12676 : i64} = add64(block_size_12165, write_i_12586) let {i64_arg_12677 : i64} = sub64(x_12676, m_12645) let {i64_res_12678 : i32} = sext i64 i64_arg_12677 to i32 let {i64_arg_12679 : i64} = sub64(block_size_12165, write_i_12586) let {i64_res_12680 : i32} = sext i64 i64_arg_12679 to i32 let {i32_res_12681 : 
i64} = sext i32 i64_res_12680 to i64 let {i32_res_12682 : i64} = sext i32 i64_res_12678 to i64 in {i32_res_12681, i32_res_12682} } : {i64, i64} return {block_12644 : [bp1_12177][bp1_12177] with ([defunc_0_f_res_12674, defunc_0_f_res_12675] = write_value_12672)} } in {scatter_2d_res_12670} } let {defunc_2_f_res_12683 : [block_size_12165][block_size_12165]i32 @ color_12998 -> {base: [bp1_12177, bp1_12177]; contiguous: false; LMADs: [{offset: add_nw64 (bp1_12177) (1i64); strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [block_size_12165, block_size_12165]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = #{index_certs_12204} block_12642[1i64 :+ block_size_12165 * 1i64, 1i64 :+ block_size_12165 * 1i64] return {returns defunc_2_f_res_12683} } let {bounds_cert_12358 : unit} = assert(inBounds_12231, {"Flat slice out of bounds: n_4508 and [(k_4509, s1_4515), (l_4510, s2_4516), (p_4511, s3_4517)]"}, "tests/slice-lmads/intrinsics.fut:11:3-57") let {flat_update_3d_res_12359 : [n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = #{bounds_cert_12358} input_12209 with [flat_index_3d_arg_12212; ip1_12210 : flat_index_3d_arg_12179, block_size_12165 : defunc_0_f_res_12171, block_size_12165 : 1i64] = defunc_5_map_res_12592 in {flat_update_3d_res_12359} } let {upper_bound_12360 : i64} = sub64(num_blocks_12176, 1i64) let {loop_nonempty_12361 : bool} = slt64(0i64, upper_bound_12360) let {loop_not_taken_12362 : bool} = not loop_nonempty_12361 let {protect_assert_disj_12363 : bool} = logor(index_ok_12191, loop_not_taken_12362) let {index_certs_12364 : unit} = assert(protect_assert_disj_12363, {"Index [", 0i64 : i64, ", ", 0i64 : i64, ":] out of bounds for array of shape [", bp1_12177 : i64, "][", bp1_12177 : i64, "]."}, "tests/slice-lmads/nw-cosmin.fut:59:3-101:31") let {protect_assert_disj_12365 : bool} = logor(index_ok_12197, 
loop_not_taken_12362) let {index_certs_12366 : unit} = assert(protect_assert_disj_12365, {"Index [", 1i64 : i64, ":, ", 0i64 : i64, "] out of bounds for array of shape [", bp1_12177 : i64, "][", bp1_12177 : i64, "]."}, "tests/slice-lmads/nw-cosmin.fut:60:3-101:31") let {protect_assert_disj_12367 : bool} = logor(index_ok_12202, loop_not_taken_12362) let {index_certs_12368 : unit} = assert(protect_assert_disj_12367, {"Index [", 1i64 : i64, ":, ", 1i64 : i64, ":] out of bounds for array of shape [", bp1_12177 : i64, "][", bp1_12177 : i64, "]."}, "tests/slice-lmads/nw-cosmin.fut:101:6-18") let {input_12369 : [n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = loop {input_12371 : *[n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = {input_12207} for i_12370:i64 < upper_bound_12360 do { let {x_12372 : i64} = sub64(num_blocks_12176, i_12370) let {flat_index_3d_arg_12373 : i64} = sub64(x_12372, 1i64) let {x_12374 : i64} = add64(1i64, i_12370) let {x_12375 : i64} = mul64(block_size_12165, x_12374) let {x_12376 : i64} = add64(2i64, x_12375) let {x_12377 : i64} = mul64(defunc_0_f_res_12171, x_12376) let {flat_index_3d_arg_12378 : i64} = sub64(x_12377, block_size_12165) let {offset_inbounds_down_12379 : bool} = ule64(0i64, flat_index_3d_arg_12378) let {offset_inbounds_up_12380 : bool} = ult64(flat_index_3d_arg_12378, n_12164) let {span_12381 : i64} = sub_nw64(flat_index_3d_arg_12373, 1i64) let {span_12382 : i64} = mul_nw64(flat_index_3d_arg_12179, span_12381) let {span_and_lower_12383 : i64} = add_nw64(flat_index_3d_arg_12378, span_12382) let {minimum_12384 : i64} = umin64(flat_index_3d_arg_12378, span_and_lower_12383) let {maximum_12385 : i64} = umax64(flat_index_3d_arg_12378, span_and_lower_12383) let 
{span_and_lower_12386 : i64} = add_nw64(span_12205, minimum_12384) let {span_and_upper_12387 : i64} = add_nw64(span_12205, maximum_12385) let {minimum_12388 : i64} = umin64(minimum_12384, span_and_lower_12386) let {maximum_12389 : i64} = umax64(maximum_12385, span_and_upper_12387) let {span_and_lower_12390 : i64} = add_nw64(upper_bound_12200, minimum_12388) let {span_and_upper_12391 : i64} = add_nw64(upper_bound_12200, maximum_12389) let {minimum_12392 : i64} = umin64(minimum_12388, span_and_lower_12390) let {maximum_12393 : i64} = umax64(maximum_12389, span_and_upper_12391) let {min_in_bounds_12394 : bool} = ule64(0i64, minimum_12392) let {max_in_bounds_12395 : bool} = ult64(maximum_12393, n_12164) let {inBounds_12396 : bool} = logand(offset_inbounds_down_12379, offset_inbounds_up_12380) let {inBounds_12397 : bool} = logand(min_in_bounds_12394, inBounds_12396) let {inBounds_12398 : bool} = logand(max_in_bounds_12395, inBounds_12397) let {bounds_cert_12399 : unit} = assert(inBounds_12398, {"Flat slice out of bounds: n_4488 and [(n1_4492, s1_4493), (n2_4494, s2_4495), (n3_4496, s3_4497)]"}, "tests/slice-lmads/intrinsics.fut:8:3-62") let {flat_index_3d_res_12400 : [flat_index_3d_arg_12373][block_size_12165][block_size_12165]i32 @ refs_mem_12854 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: flat_index_3d_arg_12378; strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171, 1i64]; rotates: [0i64, 0i64, 0i64]; shape: [flat_index_3d_arg_12373, block_size_12165, block_size_12165]; permutation: [0, 1, 2]; monotonicity: [Unknown, Unknown, Unknown]}]}} = #{bounds_cert_12399} refs_12168[flat_index_3d_arg_12378; flat_index_3d_arg_12373 : flat_index_3d_arg_12179, block_size_12165 : defunc_0_f_res_12171, block_size_12165 : 1i64] let {x_12401 : i64} = add64(1i64, x_12375) let {x_12402 : i64} = mul64(defunc_0_f_res_12171, x_12401) let {x_12403 : i64} = sub64(x_12402, block_size_12165) let {x_12404 : i64} = sub64(x_12403, 1i64) let {flat_index_2d_arg_12405 : i64} = 
add64(defunc_0_f_res_12171, x_12404) let {offset_inbounds_down_12406 : bool} = ule64(0i64, flat_index_2d_arg_12405) let {offset_inbounds_up_12407 : bool} = ult64(flat_index_2d_arg_12405, n_12164) let {span_and_lower_12408 : i64} = add_nw64(span_12382, flat_index_2d_arg_12405) let {minimum_12409 : i64} = umin64(flat_index_2d_arg_12405, span_and_lower_12408) let {maximum_12410 : i64} = umax64(flat_index_2d_arg_12405, span_and_lower_12408) let {span_and_lower_12411 : i64} = add_nw64(span_12205, minimum_12409) let {span_and_upper_12412 : i64} = add_nw64(span_12205, maximum_12410) let {minimum_12413 : i64} = umin64(minimum_12409, span_and_lower_12411) let {maximum_12414 : i64} = umax64(maximum_12410, span_and_upper_12412) let {min_in_bounds_12415 : bool} = ule64(0i64, minimum_12413) let {max_in_bounds_12416 : bool} = ult64(maximum_12414, n_12164) let {inBounds_12417 : bool} = logand(offset_inbounds_down_12406, offset_inbounds_up_12407) let {inBounds_12418 : bool} = logand(min_in_bounds_12415, inBounds_12417) let {inBounds_12419 : bool} = logand(max_in_bounds_12416, inBounds_12418) let {bounds_cert_12420 : unit} = assert(inBounds_12419, {"Flat slice out of bounds: n_4457 and [(n1_4461, s1_4462), (n2_4463, s2_4464)]"}, "tests/slice-lmads/intrinsics.fut:2:3-54") let {flat_index_2d_res_12421 : [flat_index_3d_arg_12373][block_size_12165]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: flat_index_2d_arg_12405; strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171]; rotates: [0i64, 0i64]; shape: [flat_index_3d_arg_12373, block_size_12165]; permutation: [0, 1]; monotonicity: [Unknown, Unknown]}]}} = #{bounds_cert_12420} input_12371[flat_index_2d_arg_12405; flat_index_3d_arg_12373 : flat_index_3d_arg_12179, block_size_12165 : defunc_0_f_res_12171] let {offset_inbounds_down_12422 : bool} = ule64(0i64, x_12404) let {offset_inbounds_up_12423 : bool} = ult64(x_12404, n_12164) let {span_and_lower_12424 : i64} = add_nw64(span_12382, x_12404) let 
{minimum_12425 : i64} = umin64(x_12404, span_and_lower_12424) let {maximum_12426 : i64} = umax64(x_12404, span_and_lower_12424) let {span_and_lower_12427 : i64} = add_nw64(block_size_12165, minimum_12425) let {span_and_upper_12428 : i64} = add_nw64(block_size_12165, maximum_12426) let {minimum_12429 : i64} = umin64(minimum_12425, span_and_lower_12427) let {maximum_12430 : i64} = umax64(maximum_12426, span_and_upper_12428) let {min_in_bounds_12431 : bool} = ule64(0i64, minimum_12429) let {max_in_bounds_12432 : bool} = ult64(maximum_12430, n_12164) let {inBounds_12433 : bool} = logand(offset_inbounds_down_12422, offset_inbounds_up_12423) let {inBounds_12434 : bool} = logand(min_in_bounds_12431, inBounds_12433) let {inBounds_12435 : bool} = logand(max_in_bounds_12432, inBounds_12434) let {bounds_cert_12436 : unit} = assert(inBounds_12435, {"Flat slice out of bounds: n_4457 and [(n1_4461, s1_4462), (n2_4463, s2_4464)]"}, "tests/slice-lmads/intrinsics.fut:2:3-54") let {flat_index_2d_res_12437 : [flat_index_3d_arg_12373][bp1_12177]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: x_12404; strides: [flat_index_3d_arg_12179, 1i64]; rotates: [0i64, 0i64]; shape: [flat_index_3d_arg_12373, bp1_12177]; permutation: [0, 1]; monotonicity: [Unknown, Unknown]}]}} = #{bounds_cert_12436} input_12371[x_12404; flat_index_3d_arg_12373 : flat_index_3d_arg_12179, bp1_12177 : 1i64] let {binop_x_12975 : i64} = #[incremental_flattening(only_intra)] mul_nw64(block_size_12165, flat_index_3d_arg_12373) let {binop_x_12976 : i64} = #[incremental_flattening(only_intra)] mul_nw64(block_size_12165, binop_x_12975) let {binop_y_12977 : i64} = #[incremental_flattening(only_intra)] mul_nw64(4i64, binop_x_12976) let {bytes_12978 : i64} = #[incremental_flattening(only_intra)] smax64(0i64, binop_y_12977) let {mem_12979 : mem} = #[incremental_flattening(only_intra)] alloc(bytes_12978) let {defunc_5_map_res_12745 : [flat_index_3d_arg_12373][block_size_12165][block_size_12165]i32 
@ input_mem_12853 -> {base: [flat_index_3d_arg_12373, block_size_12165, block_size_12165]; contiguous: true; LMADs: [{offset: flat_index_3d_arg_12378; strides: [flat_index_3d_arg_12179, defunc_0_f_res_12171, 1i64]; rotates: [0i64, 0i64, 0i64]; shape: [flat_index_3d_arg_12373, block_size_12165, block_size_12165]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}} = #[incremental_flattening(only_intra)] segmap(group; #groups=flat_index_3d_arg_12373; groupsize=block_size_12165) (gtid_12684 < flat_index_3d_arg_12373) (~phys_tid_12744) : {[block_size_12165][block_size_12165]i32} { let {color_12999 : mem@local} = alloc(bytes_12879, @local) let {color_13000 : mem@local} = alloc(bytes_12865, @local) let {x_12746 : [bp1_12177]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: add_nw64 (x_12404) (mul_nw64 (gtid_12684) (flat_index_3d_arg_12179)); strides: [1i64]; rotates: [0i64]; shape: [bp1_12177]; permutation: [0]; monotonicity: [Unknown]}]}} = flat_index_2d_res_12437[gtid_12684, 0i64 :+ bp1_12177 * 1i64] let {x_12748 : [block_size_12165]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: add_nw64 (flat_index_2d_arg_12405) (mul_nw64 (gtid_12684) (flat_index_3d_arg_12179)); strides: [defunc_0_f_res_12171]; rotates: [0i64]; shape: [block_size_12165]; permutation: [0]; monotonicity: [Unknown]}]}} = flat_index_2d_res_12421[gtid_12684, 0i64 :+ block_size_12165 * 1i64] let {defunc_2_map_res_12752 : [bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = replicate([bp1_12177][bp1_12177], 0i32) let {block_12753 : [bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; 
monotonicity: [Inc, Inc]}]}} = #{index_certs_12364} defunc_2_map_res_12752 with [0i64, 0i64 :+ bp1_12177 * 1i64] = x_12746 let {block_12754 : [bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = #{index_certs_12366} block_12753 with [1i64 :+ block_size_12165 * 1i64, 0i64] = x_12748 let {block_12755 : [bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = loop {block_12757 : *[bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = {block_12754} for m_12756:i64 < block_size_12165 do { let {defunc_1_map_res_12758 : [block_size_12165]i32 @ color_12999 -> {base: [block_size_12165]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [block_size_12165]; permutation: [0]; monotonicity: [Inc]}]}} = segmap(thread; #groups=flat_index_3d_arg_12373; groupsize=block_size_12165) (gtid_12687 < block_size_12165) (~phys_tid_12688) : {i32} { let {cond_12760 : bool} = slt64(m_12756, gtid_12687) let {defunc_0_f_res_12761 : i32} = if cond_12760 then {0i32} else { let {i64_arg_12762 : i64} = add64(1i64, gtid_12687) let {i64_res_12763 : i32} = sext i64 i64_arg_12762 to i32 let {x_12764 : i64} = sub64(m_12756, gtid_12687) let {i64_arg_12765 : i64} = add64(1i64, x_12764) let {i64_res_12766 : i32} = sext i64 i64_arg_12765 to i32 let {i_12767 : i32} = sub32(i64_res_12766, 1i32) let {i_12768 : i64} = sext i32 i_12767 to i64 let {x_12769 : i64} = sext i32 i64_res_12763 to i64 let {x_12770 : 
i32} = block_12757[i_12768, x_12769] let {max_arg_12771 : i32} = sub32(x_12770, penalty_12166) let {y_12772 : i64} = sext i32 i64_res_12766 to i64 let {i_12773 : i32} = sub32(i64_res_12763, 1i32) let {i_12774 : i64} = sext i32 i_12773 to i64 let {x_12775 : i32} = block_12757[y_12772, i_12774] let {max_arg_12776 : i32} = sub32(x_12775, penalty_12166) let {max_res_12777 : i32} = smax32(max_arg_12771, max_arg_12776) let {x_12778 : i32} = block_12757[i_12768, i_12774] let {y_12779 : i32} = flat_index_3d_res_12400[gtid_12684, i_12768, i_12774] let {max_arg_12780 : i32} = add32(x_12778, y_12779) let {defunc_0_f_res_12781 : i32} = smax32(max_res_12777, max_arg_12780) in {defunc_0_f_res_12781} } : {i32} return {returns defunc_0_f_res_12761} } let {scatter_2d_res_12782 : [bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = segmap(thread; #groups=flat_index_3d_arg_12373; groupsize=block_size_12165) (write_i_12712 < block_size_12165) (~phys_tid_12713) : {i32} { let {write_value_12784 : i32} = defunc_1_map_res_12758[write_i_12712] let {cond_12785 : bool} = slt64(m_12756, write_i_12712) let {defunc_0_f_res_12786 : i64, defunc_0_f_res_12787 : i64} = if cond_12785 then {-1i64, -1i64} else { let {i64_arg_12788 : i64} = add64(1i64, write_i_12712) let {i64_res_12789 : i32} = sext i64 i64_arg_12788 to i32 let {x_12790 : i64} = sub64(m_12756, write_i_12712) let {i64_arg_12791 : i64} = add64(1i64, x_12790) let {i64_res_12792 : i32} = sext i64 i64_arg_12791 to i32 let {i32_res_12793 : i64} = sext i32 i64_res_12792 to i64 let {i32_res_12794 : i64} = sext i32 i64_res_12789 to i64 in {i32_res_12793, i32_res_12794} } : {i64, i64} return {block_12757 : [bp1_12177][bp1_12177] with ([defunc_0_f_res_12786, defunc_0_f_res_12787] = write_value_12784)} } in {scatter_2d_res_12782} } let {block_12795 : 
[bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = loop {block_12797 : *[bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = {block_12755} for m_12796:i64 < upper_bound_12200 do { let {m_12798 : i64} = sub64(x_12201, m_12796) let {defunc_1_map_res_12799 : [block_size_12165]i32 @ color_12999 -> {base: [block_size_12165]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [block_size_12165]; permutation: [0]; monotonicity: [Inc]}]}} = segmap(thread; #groups=flat_index_3d_arg_12373; groupsize=block_size_12165) (gtid_12714 < block_size_12165) (~phys_tid_12715) : {i32} { let {cond_12801 : bool} = slt64(m_12798, gtid_12714) let {defunc_0_f_res_12802 : i32} = if cond_12801 then {0i32} else { let {x_12803 : i64} = add64(block_size_12165, gtid_12714) let {i64_arg_12804 : i64} = sub64(x_12803, m_12798) let {i64_res_12805 : i32} = sext i64 i64_arg_12804 to i32 let {i64_arg_12806 : i64} = sub64(block_size_12165, gtid_12714) let {i64_res_12807 : i32} = sext i64 i64_arg_12806 to i32 let {i_12808 : i32} = sub32(i64_res_12807, 1i32) let {i_12809 : i64} = sext i32 i_12808 to i64 let {x_12810 : i64} = sext i32 i64_res_12805 to i64 let {x_12811 : i32} = block_12797[i_12809, x_12810] let {max_arg_12812 : i32} = sub32(x_12811, penalty_12166) let {y_12813 : i64} = sext i32 i64_res_12807 to i64 let {i_12814 : i32} = sub32(i64_res_12805, 1i32) let {i_12815 : i64} = sext i32 i_12814 to i64 let {x_12816 : i32} = block_12797[y_12813, i_12815] let {max_arg_12817 : i32} = sub32(x_12816, penalty_12166) let {max_res_12818 : i32} = smax32(max_arg_12812, max_arg_12817) let 
{x_12819 : i32} = block_12797[i_12809, i_12815] let {y_12820 : i32} = flat_index_3d_res_12400[gtid_12684, i_12809, i_12815] let {max_arg_12821 : i32} = add32(x_12819, y_12820) let {defunc_0_f_res_12822 : i32} = smax32(max_res_12818, max_arg_12821) in {defunc_0_f_res_12822} } : {i32} return {returns defunc_0_f_res_12802} } let {scatter_2d_res_12823 : [bp1_12177][bp1_12177]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: true; LMADs: [{offset: 0i64; strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [bp1_12177, bp1_12177]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = segmap(thread; #groups=flat_index_3d_arg_12373; groupsize=block_size_12165) (write_i_12739 < block_size_12165) (~phys_tid_12740) : {i32} { let {write_value_12825 : i32} = defunc_1_map_res_12799[write_i_12739] let {cond_12826 : bool} = slt64(m_12798, write_i_12739) let {defunc_0_f_res_12827 : i64, defunc_0_f_res_12828 : i64} = if cond_12826 then {-1i64, -1i64} else { let {x_12829 : i64} = add64(block_size_12165, write_i_12739) let {i64_arg_12830 : i64} = sub64(x_12829, m_12798) let {i64_res_12831 : i32} = sext i64 i64_arg_12830 to i32 let {i64_arg_12832 : i64} = sub64(block_size_12165, write_i_12739) let {i64_res_12833 : i32} = sext i64 i64_arg_12832 to i32 let {i32_res_12834 : i64} = sext i32 i64_res_12833 to i64 let {i32_res_12835 : i64} = sext i32 i64_res_12831 to i64 in {i32_res_12834, i32_res_12835} } : {i64, i64} return {block_12797 : [bp1_12177][bp1_12177] with ([defunc_0_f_res_12827, defunc_0_f_res_12828] = write_value_12825)} } in {scatter_2d_res_12823} } let {defunc_2_f_res_12836 : [block_size_12165][block_size_12165]i32 @ color_13000 -> {base: [bp1_12177, bp1_12177]; contiguous: false; LMADs: [{offset: add_nw64 (bp1_12177) (1i64); strides: [bp1_12177, 1i64]; rotates: [0i64, 0i64]; shape: [block_size_12165, block_size_12165]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}} = #{index_certs_12368} block_12795[1i64 :+ block_size_12165 * 1i64, 1i64 :+ block_size_12165 
* 1i64] return {returns defunc_2_f_res_12836} } let {bounds_cert_12529 : unit} = assert(inBounds_12398, {"Flat slice out of bounds: n_4508 and [(k_4509, s1_4515), (l_4510, s2_4516), (p_4511, s3_4517)]"}, "tests/slice-lmads/intrinsics.fut:11:3-57") let {flat_update_3d_res_12530 : [n_12164]i32 @ input_mem_12853 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = #{bounds_cert_12529} input_12371 with [flat_index_3d_arg_12378; flat_index_3d_arg_12373 : flat_index_3d_arg_12179, block_size_12165 : defunc_0_f_res_12171, block_size_12165 : 1i64] = defunc_5_map_res_12745 in {flat_update_3d_res_12530} } let {binop_y_12985 : i64} = mul_nw64(4i64, n_12164) let {bytes_12986 : i64} = smax64(0i64, binop_y_12985) let {mem_12987 : mem} = alloc(bytes_12986) let {input_linear_12988 : [n_12164]i32 @ mem_12987 -> {base: [n_12164]; contiguous: true; LMADs: [{offset: 0i64; strides: [1i64]; rotates: [0i64]; shape: [n_12164]; permutation: [0]; monotonicity: [Inc]}]}} = copy(input_12369) in {mem_12987, input_linear_12988} } ```

Once all of this is in place, however, the ImpGen code will still insert the copy corresponding to

      let {flat_update_3d_res_12359 : [n_12164]i32 @ input_mem_12853 ->
                                      {base: [n_12164]; contiguous: true;
                                       LMADs: [{offset: 0i64; strides: [1i64];
                                                rotates: [0i64];
                                                shape: [n_12164];
                                                permutation: [0];
                                                monotonicity: [Inc]}]}} =
        #{bounds_cert_12358}
        input_12209 with [flat_index_3d_arg_12212; ip1_12210 : flat_index_3d_arg_12179,
                                                   block_size_12165 : defunc_0_f_res_12171,
                                                   block_size_12165 : 1i64] = defunc_5_map_res_12592

which has no effect because the source and destination are identical. This means that, in order to take advantage of our memory coalescing in cases like this, we need ImpGen to identify when a copy has identical source and destination, and then treat it as a no-op.

Note that we cannot simply remove the flat_update, because then the result of the segmap is never used and will be removed by the simplifier. It needs to remain, but not generate any code in the end.

athas commented 3 years ago

There is some code in ImpGen that is supposed to detect this, but I think it's too sensitive. I think it's fairly robust to detect statically whether the source and target memory blocks are the same, and the problem is that the slice might not be exactly (statically) the same on both sides. In some cases it might be tricky to determine, too. So maybe we can generate a conditional that, at run-time, checks whether the slices are identical, and if so elides the copy?

Munksgaard commented 3 years ago

> There is some code in ImpGen that is supposed to detect this, but I think it's too sensitive. I think it's fairly robust to detect statically whether the source and target memory blocks are the same, and the problem is that the slice might not be exactly (statically) the same on both sides. In some cases it might be tricky to determine, too. So maybe we can generate a conditional that, at run-time, checks whether the slices are identical, and if so elides the copy?

I can't tell for sure, but when I run `futhark dev -e --backend=opencl --server nw-cosmin.fut_gpu_mem`, that check doesn't seem to run. Here are the `destlocation'` and `srclocation'` values being checked:

destlocation': MemLoc {memLocName = VName (Name "color") 12998, memLocShape = [Var (VName (Name "bp1") 12177),Var (VName (Name "bp1") 12177)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldPerm = 0, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}], ixfunContig = True}}
srclocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "bp1") 12177)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "y") 12211)) (IntType Int64)) (BinOpExp (Mul Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "gtid") 12531)) (IntType Int64)) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12179)) (IntType Int64)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldPerm = 0, ldMon = Unknown}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}], ixfunContig = True}}
destlocation': MemLoc {memLocName = VName (Name "color") 12998, memLocShape = [Var (VName (Name "bp1") 12177),Var (VName (Name "bp1") 12177)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}], ixfunContig = False}}
srclocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "block_size") 12165)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "flat_index_2d_arg") 12234)) (IntType Int64)) (BinOpExp (Mul Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "gtid") 12531)) (IntType Int64)) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12179)) (IntType Int64)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "defunc_0_f_res") 12171)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Unknown}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}], ixfunContig = True}}
destlocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "ip1") 12210),Var (VName (Name "block_size") 12165),Var (VName (Name "block_size") 12165)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12212)) (IntType Int64)) (BinOpExp (Mul Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "gtid") 12531)) (IntType Int64)) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12179)) (IntType Int64)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "defunc_0_f_res") 12171)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Inc},LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 1, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "ip1") 12210)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}], ixfunContig = True}}
srclocation': MemLoc {memLocName = VName (Name "color") 12998, memLocShape = [Var (VName (Name "block_size") 12165),Var (VName (Name "block_size") 12165)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)) (ValueExp (IntValue (Int64Value 1)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Inc},LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 1, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}], ixfunContig = False}}
destlocation': MemLoc {memLocName = VName (Name "color") 13000, memLocShape = [Var (VName (Name "bp1") 12177),Var (VName (Name "bp1") 12177)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldPerm = 0, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}], ixfunContig = True}}
srclocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "bp1") 12177)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "x") 12404)) (IntType Int64)) (BinOpExp (Mul Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "gtid") 12684)) (IntType Int64)) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12179)) (IntType Int64)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldPerm = 0, ldMon = Unknown}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}], ixfunContig = True}}
destlocation': MemLoc {memLocName = VName (Name "color") 13000, memLocShape = [Var (VName (Name "bp1") 12177),Var (VName (Name "bp1") 12177)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}], ixfunContig = False}}
srclocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "block_size") 12165)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "flat_index_2d_arg") 12405)) (IntType Int64)) (BinOpExp (Mul Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "gtid") 12684)) (IntType Int64)) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12179)) (IntType Int64)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "defunc_0_f_res") 12171)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Unknown}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}], ixfunContig = True}}
destlocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "flat_index_3d_arg") 12373),Var (VName (Name "block_size") 12165),Var (VName (Name "block_size") 12165)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12378)) (IntType Int64)) (BinOpExp (Mul Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "gtid") 12684)) (IntType Int64)) (LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12179)) (IntType Int64)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "defunc_0_f_res") 12171)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Inc},LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 1, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "flat_index_3d_arg") 12373)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}], ixfunContig = True}}
srclocation': MemLoc {memLocName = VName (Name "color") 13000, memLocShape = [Var (VName (Name "block_size") 12165),Var (VName (Name "block_size") 12165)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = BinOpExp (Add Int64 OverflowUndef) (LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)) (ValueExp (IntValue (Int64Value 1)))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 0, ldMon = Inc},LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "block_size") 12165)) (IntType Int64)}, ldPerm = 1, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)},TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "bp1") 12177)) (IntType Int64)}], ixfunContig = False}}
destlocation': MemLoc {memLocName = VName (Name "mem") 12987, memLocShape = [Var (VName (Name "n") 12164)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}, ldPerm = 0, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}], ixfunContig = True}}
srclocation': MemLoc {memLocName = VName (Name "input_mem") 12853, memLocShape = [Var (VName (Name "n") 12164)], memLocIxFun = IxFun {ixfunLMADs = LMAD {lmadOffset = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, lmadDims = [LMADDim {ldStride = TPrimExp {untyped = ValueExp (IntValue (Int64Value 1))}, ldRotate = TPrimExp {untyped = ValueExp (IntValue (Int64Value 0))}, ldShape = TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}, ldPerm = 0, ldMon = Inc}]} :| [], base = [TPrimExp {untyped = LeafExp (ScalarVar (VName (Name "n") 12164)) (IntType Int64)}], ixfunContig = True}}

Unless I'm missing something, there should be a case where there is a destlocation' followed by a srclocation' that both refer to input_mem_12853.