reuse_at generates inconsistent index access for streaming channel

An example: we have a receiver function that needs to read data sequentially from the streaming channel, and a sender function, shown in the following code block, that writes data (i.e., calc_x_gradient.grad_x) into the channel.

We can apply a data-reuse schedule (reuse_at) to the sender function to exploit data locality, but this leads to inconsistent index access between the sender and the receiver (i.e., the reader reads data[x + y*1024] while the writer writes data[x + y*1024 - 2]). To avoid the incorrect results this inconsistency would cause, the streaming inference IR pass generates an additional loop nest that writes the data into the channel (i.e., c_buf_2.write(calc_x_gradient.grad_x[(buf_1 + (buf_0*1024))])). This extra pass over the buffer leads to performance degradation.

def calc_x_gradient(handle64(calc_x_gradient.input_image[436*1024]), handle64(calc_x_gradient.grad_x[436*1024])) {
  for (y, 0, 436) {
    for (x.reuse, 0, 1024) {
      produce calc_x_gradient.input_image.reuse {
        for (calc_x_gradient.input_image.0, 0, 4) {
          calc_x_gradient.input_image.reuse[calc_x_gradient.input_image.0] = calc_x_gradient.input_image.reuse[(calc_x_gradient.input_image.0 + 1)]
        }
        calc_x_gradient.input_image.reuse[4] = calc_x_gradient.input_image[(x.reuse + (y*1024))]
      }
      if ((4 <= x.reuse)) {
        allocate reducer0[float32 * 1]
        reducer0[0] = 0.000000f
        for (rdx, 0, 5) {
          reducer0[0] = ((calc_x_gradient.input_image.reuse[rdx]*float32(g_w[rdx])) + reducer0[0])
        }
        calc_x_gradient.grad_x[((x.reuse + (y*1024)) + -2)] = reducer0[0]
      }
    }
  }
  pipelined (buf_1, 0, 1024) {
    for (buf_0, 0, 436) {
      c_buf_2.write(calc_x_gradient.grad_x[(buf_1 + (buf_0*1024))])
    }
  }
}

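As I discussed with Sean, a simple solution is to add an extra if-else statement around the conditional block so that the sender writes to the channel in the same index order in which the receiver reads from it. The boundary positions of each row that the stencil does not produce are padded with zeros, so that every row still emits exactly 1024 values in order.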

def calc_x_gradient(handle64(calc_x_gradient.input_image[436*1024]), handle64(calc_x_gradient.grad_x[436*1024])) {
  for (y, 0, 436) {
    // Two extra iterations (1024, 1025) per row emit the trailing zero padding,
    // so each row writes exactly 1024 values to the channel, in index order.
    for (x.reuse, 0, 1026) {
      // Only shift/fill the reuse buffer while x.reuse is inside the row;
      // reading input_image at x.reuse >= 1024 would run past the row
      // (and past the end of the buffer on the last row).
      if ((x.reuse < 1024)) {
        produce calc_x_gradient.input_image.reuse {
          for (calc_x_gradient.input_image.0, 0, 4) {
            calc_x_gradient.input_image.reuse[calc_x_gradient.input_image.0] = calc_x_gradient.input_image.reuse[(calc_x_gradient.input_image.0 + 1)]
          }
          calc_x_gradient.input_image.reuse[4] = calc_x_gradient.input_image[(x.reuse + (y*1024))]
        }
      }

      if (((x.reuse < 2) || (1024 <= x.reuse))) {
        // Zero padding for output columns 0, 1 (x.reuse = 0, 1) and
        // 1022, 1023 (x.reuse = 1024, 1025), which the 5-tap stencil does not produce.
        c_buf_2.write(0.000000f)
      } elif ((4 <= x.reuse)) {
        // Output column (x.reuse - 2): the reuse buffer holds a full 5-wide window.
        allocate reducer0[float32 * 1]
        reducer0[0] = 0.000000f
        for (rdx, 0, 5) {
          reducer0[0] = ((calc_x_gradient.input_image.reuse[rdx]*float32(g_w[rdx])) + reducer0[0])
        }
        c_buf_2.write(reducer0[0])
      }
      // x.reuse = 2, 3: the window is still filling, so nothing is written.
    }
  }
}

cornell-zhang / heterocl

reuse_at generates inconsistent index access for streaming channel #161