diku-dk / futhark

:boom::computer::boom: A data-parallel functional programming language
http://futhark-lang.org
ISC License
2.38k stars 165 forks source link

Invalid short circuiting #2084

Open athas opened 8 months ago

athas commented 8 months ago

Suppose we have a program like the following:

types {

}

entry("main",
      {as: [][][]f32},
      {*[][][]f32})
  entry_main (as_mem_5569 : mem@device,
              n₁_5381 : i64,
              n₃_5382 : i64,
              m₄_5383 : i64,
              as_5384 : [n₁_5381][n₃_5382][m₄_5383]f32 @ as_mem_5569 ->
                        {offset: 0i64;
                         strides: [mul_nw64 (m₄_5383) (n₃_5382), m₄_5383, 1i64];
                         shape: [n₁_5381, n₃_5382, m₄_5383]})
  : {mem@device,
     *[n₁_5381][n₃_5382][m₄_5383]f32 @ ?0@device ->
     {offset: 0i64;
      strides: [mul_nw64 (m₄_5383) (n₃_5382), m₄_5383, 1i64];
      shape: [n₁_5381, n₃_5382, m₄_5383]}} = {
  let {j_m_i_5505 : i64} =
    sub64(n₃_5382, 1i64)
  let {empty_slice_5506 : bool} =
    eq_i64(j_m_i_5505, 0i64)
  let {zero_leq_i_p_m_t_s_5507 : bool} =
    sle64(0i64, j_m_i_5505)
  let {i_lte_j_5508 : bool} =
    sle64(1i64, n₃_5382)
  let {loop_nonempty_5509 : bool} =
    slt64(0i64, n₃_5382)
  let {loop_not_taken_5510 : bool} =
    not loop_nonempty_5509
  let {i_p_m_t_s_leq_w_5513 : bool} =
    slt64(j_m_i_5505, n₃_5382)
  let {y_5514 : bool} =
    logand(zero_leq_i_p_m_t_s_5507, i_p_m_t_s_leq_w_5513)
  let {forwards_ok_5515 : bool} =
    logand(i_lte_j_5508, y_5514)
  let {ok_or_empty_5516 : bool} =
    logor(empty_slice_5506, forwards_ok_5515)
  let {protect_assert_disj_5517 : bool} =
    logor(loop_not_taken_5510, ok_or_empty_5516)
  let {index_certs_5518 : unit} =
    assert(protect_assert_disj_5517, {"Index [", 1i64 : i64, ":", n₃_5382 : i64, "] out of bounds for array of shape [", n₃_5382 : i64, "]."}, "tests/issue811.fut:7:13-19")
  let {binop_y_5570 : i64} =
    sub_nw64(n₁_5381, 1i64)
  let {binop_x_5572 : i64} =
    smax64(0i64, binop_y_5570)
  let {binop_x_5574 : i64} =
    smax64(0i64, j_m_i_5505)
  let {binop_y_5575 : i64} =
    mul_nw64(n₁_5381, m₄_5383)
  let {binop_y_5576 : i64} =
    mul_nw64(binop_x_5574, binop_y_5575)
  let {binop_y_5577 : i64} =
    smax64(0i64, binop_y_5576)
  let {binop_x_5578 : i64} =
    add_nw64(binop_x_5572, binop_y_5577)
  let {binop_y_5579 : i64} =
    sub_nw64(m₄_5383, 1i64)
  let {binop_x_5580 : i64} =
    smax64(0i64, binop_y_5579)
  let {binop_y_5581 : i64} =
    mul_nw64(n₁_5381, binop_x_5580)
  let {binop_y_5582 : i64} =
    smax64(0i64, binop_y_5581)
  let {binop_y_5583 : i64} =
    add_nw64(binop_x_5578, binop_y_5582)
  let {binop_y_5584 : i64} =
    add_nw64(1i64, binop_y_5583)
  let {bytes_5585 : i64} =
    mul_nw64(4i64, binop_y_5584)
  let {mem_5586 : mem@device} =
    alloc(bytes_5585, @device)
  let {mem_5620 : mem@device} =
    alloc(bytes_5585, @device)
  let {binop_x_5621 : i64} =
    mul_nw64(n₁_5381, n₃_5382)
  let {binop_x_5622 : i64} =
    mul_nw64(m₄_5383, binop_x_5621)
  let {binop_y_5623 : i64} =
    mul_nw64(4i64, binop_x_5622)
  let {bytes_5624 : i64} =
    smax64(0i64, binop_y_5623)
  let {mem_5625 : mem@device} =
    alloc(bytes_5624, @device)
  let {segmap_tblock_size_5554 : i64} =
    get_size(segmap_tblock_size_5538, thread_block_size)
  let {num_tblocks_5555 : i64} =
    calc_num_tblocks(n₁_5381, segmap_num_tblocks_5540, segmap_tblock_size_5554)
  let {as_coalesced_5568 : [n₁_5381][n₃_5382][m₄_5383]f32 @ mem_5586 ->
                           {offset: 0i64;
                            strides: [1i64, mul_nw64 (n₁_5381) (m₄_5383), n₁_5381];
                            shape: [n₁_5381, n₃_5382, m₄_5383]}} =
    manifest((1, 2, 0), as_5384)
  let {ixfun_arg_5592 : i64} =
    mul_nw64(n₁_5381, m₄_5383)
  let {binop_y_5593 : i64} =
    mul_nw64(4i64, m₄_5383)
  let {bytes_5594 : i64} =
    smax64(0i64, binop_y_5593)
  let {binop_x_5596 : i64} =
    mul_nw64(n₃_5382, m₄_5383)
  let {binop_y_5597 : i64} =
    mul_nw64(4i64, binop_x_5596)
  let {bytes_5598 : i64} =
    smax64(0i64, binop_y_5597)
  let {defunc_0_map_res_5557 : [n₁_5381][n₃_5382][m₄_5383]f32 @ mem_5620 ->
                               {offset: 0i64;
                                strides: [1i64, mul_nw64 (n₁_5381) (m₄_5383), n₁_5381];
                                shape: [n₁_5381, n₃_5382, m₄_5383]}} =
    segmap(thread; virtualise; grid=num_tblocks_5555; blocksize=segmap_tblock_size_5554)
    (gtid_5558 < n₁_5381) (~phys_tid_5559) : {[n₃_5382][m₄_5383]f32} {
      let {mem_5595 : mem@device} =
        alloc(bytes_5594, @device)
      let {double_buffer_mem_5629 : mem@device} =
        alloc(bytes_5598, @device)
      let {mem_5599 : mem@device} =
        alloc(bytes_5598, @device)
      let {eta_p_5560 : [n₃_5382][m₄_5383]f32 @ mem_5586 ->
                        {offset: gtid_5558;
                         strides: [mul_nw64 (n₁_5381) (m₄_5383), n₁_5381];
                         shape: [n₃_5382, m₄_5383]}} =
        as_coalesced_5568[gtid_5558, 0i64 :+ n₃_5382 * 1i64, 0i64 :+ m₄_5383 * 1i64]
      let {ext_mem_5603 : mem@device,
           ext_5602 : i64,
           ext_5601 : i64,
           ext_5600 : i64,
           tmp_5561 : [n₃_5382][m₄_5383]f32 @ ext_mem_5603 ->
                      {offset: ext_5602;
                       strides: [ext_5601, ext_5600];
                       shape: [n₃_5382, m₄_5383]}} =
        loop {mem_param_5591 : mem@device,
              ctx_param_ext_5588 : i64,
              ctx_param_ext_5589 : i64,
              ctx_param_ext_5590 : i64,
              A_5563 : [n₃_5382][m₄_5383]f32 @ mem_param_5591 ->
                       {offset: ctx_param_ext_5588;
                        strides: [ctx_param_ext_5589, ctx_param_ext_5590];
                        shape: [n₃_5382, m₄_5383]}} = {mem_5586, gtid_5558, ixfun_arg_5592, n₁_5381, eta_p_5560}
        for _i_5562:i64 < n₃_5382 do {
          let {irow_5564 : [m₄_5383]f32 @ mem_param_5591 ->
                           {offset: ctx_param_ext_5588;
                            strides: [ctx_param_ext_5590];
                            shape: [m₄_5383]}} =
            A_5563[0i64, 0i64 :+ m₄_5383 * 1i64]
          let {Ap_5565 : [j_m_i_5505][m₄_5383]f32 @ mem_param_5591 ->
                         {offset: add_nw64 (ctx_param_ext_5588) (ctx_param_ext_5589);
                          strides: [ctx_param_ext_5589, ctx_param_ext_5590];
                          shape: [j_m_i_5505, m₄_5383]}} =
            #{index_certs_5518}
            A_5563[1i64 :+ j_m_i_5505 * 1i64, 0i64 :+ m₄_5383 * 1i64]
          let {concat_arg_5566 : [1i64][m₄_5383]f32 @ mem_5595 ->
                                 {offset: 0i64;
                                  strides: [m₄_5383, 1i64];
                                  shape: [1i64, m₄_5383]}} =
            replicate([1i64], irow_5564)
          let {++_res_5567 : [n₃_5382][m₄_5383]f32 @ mem_5599 ->
                             {offset: 0i64;
                              strides: [m₄_5383, 1i64];
                              shape: [n₃_5382, m₄_5383]}} =
            concat@0(n₃_5382, Ap_5565, concat_arg_5566)
          let {double_buffer_array_5630 : [n₃_5382][m₄_5383]f32 @ double_buffer_mem_5629 ->
                                          {offset: 0i64;
                                           strides: [m₄_5383, 1i64];
                                           shape: [n₃_5382, m₄_5383]}} =
            copy(++_res_5567)
          in {double_buffer_mem_5629, 0i64, m₄_5383, 1i64, double_buffer_array_5630}
        }
      return {returns tmp_5561}
    }
  let {defunc_0_map_res_desired_form_5626 : [n₁_5381][n₃_5382][m₄_5383]f32 @ mem_5625 ->
                                            {offset: 0i64;
                                             strides: [mul_nw64 (m₄_5383) (n₃_5382), m₄_5383, 1i64];
                                             shape: [n₁_5381, n₃_5382, m₄_5383]}} =
    manifest((0, 1, 2), defunc_0_map_res_5557)
  in {mem_5625, defunc_0_map_res_desired_form_5626}
}

(This is syntactically invalid in master because it makes use of the simplified index functions from #2083.)

Short-circuiting performs a a short circuit of the concat/copy in the loop, storing the result directly in double_buffer_mem_5629. This is invalid, because double_buffer_mem_5629 is still used for live data (Ap_5565, an input to the concat), under the name of mem_param_5591 (a loop parameter).

All the aliases look right to me, so I'm not sure why this fails.

Munksgaard commented 8 months ago

I can take a look, but it won't be until tomorrow evening.

athas commented 8 months ago

Enjoy!

athas commented 8 months ago

This is not an urgent issue, as I rewrote double buffering entirely in a way that does not trigger this bug, but it would still be good not to leave around as a potential problem in the future.