taco: xorOp higher-order tensor code and Merge Lattice generation bug

weiya711 commented 3 years ago

Currently the taco/array_algebra branch generates incorrect xorOp lowering code for 2+ order tensors (found by @rohany)

The lowering code seems to only compare for the iteration algebra on the highest level of coordinates and does not check the case where input1_crd1 == input2_crd1 but input1_crd2 != input2_crd2.

Created a branch in taco/array_algebra_xor_fix that:

adds in a new test called XorOpOrder2 in test/tests-lower.cpp to test the 2-dimensional C(i, j) = xorOp(A(i, j), B(i, j))
Applies a patch from @rohany in taco/lower/lowerer-impl.cpp in lowerMergeCases(...) that should fix this bug

rohany commented 3 years ago

@RawnH do you have suggestions for this bug? A the problem can be seen in the generated code for a 2-d element wise xor op:

int compute(taco_tensor_t *z, taco_tensor_t *y, taco_tensor_t *x) {
  int32_t* restrict z_vals = (int32_t*)(z->vals);
  int* restrict y1_pos = (int*)(y->indices[0][0]);
  int* restrict y1_crd = (int*)(y->indices[0][1]);
  int* restrict y2_pos = (int*)(y->indices[1][0]);
  int* restrict y2_crd = (int*)(y->indices[1][1]);
  int32_t* restrict y_vals = (int32_t*)(y->vals);
  int* restrict x1_pos = (int*)(x->indices[0][0]);
  int* restrict x1_crd = (int*)(x->indices[0][1]);
  int* restrict x2_pos = (int*)(x->indices[1][0]);
  int* restrict x2_crd = (int*)(x->indices[1][1]);
  int32_t* restrict x_vals = (int32_t*)(x->vals);
  int32_t i1504z = 0;
  int32_t i1503y = y1_pos[0];
  int32_t py1_end = y1_pos[1];
  int32_t i1503x = x1_pos[0];
  int32_t px1_end = x1_pos[1];
  while (i1503y < py1_end && i1503x < px1_end) {
    int32_t i1503y0 = y1_crd[i1503y];
    int32_t i1503x0 = x1_crd[i1503x];
    int32_t i1503 = TACO_MIN(i1503y0,i1503x0);
    if (i1503y0 == i1503 && i1503x0 != i1503) {
      for (int32_t i1504y = y2_pos[i1503y]; i1504y < y2_pos[(i1503y + 1)]; i1504y++) {
        z_vals[i1504z] = y_vals[i1504y];
        i1504z++;
      }
    }
    else if (i1503x0 == i1503 && i1503y0 != i1503) {
      for (int32_t i1504x = x2_pos[i1503x]; i1504x < x2_pos[(i1503x + 1)]; i1504x++) {
        z_vals[i1504z] = x_vals[i1504x];
        i1504z++;
      }
    }
    i1503y += (int32_t)(i1503y0 == i1503);
    i1503x += (int32_t)(i1503x0 == i1503);
  }
  while (i1503y < py1_end) {
    for (int32_t i1504y = y2_pos[i1503y]; i1504y < y2_pos[(i1503y + 1)]; i1504y++) {
      z_vals[i1504z] = y_vals[i1504y];
      i1504z++;
    }
    i1503y++;
  }
  while (i1503x < px1_end) {
    for (int32_t i1504x = x2_pos[i1503x]; i1504x < x2_pos[(i1503x + 1)]; i1504x++) {
      z_vals[i1504z] = x_vals[i1504x];
      i1504z++;
    }
    i1503x++;
  }
  return 0;
}

In particular, the code doesn't generate a case for the omitter point when the value on the first mode is equal to the value on the of the first mode of the other tensor. Instead, it needs to emit the case there, and recurse down into the point.

rawnhenry commented 3 years ago

I will look at this when I look at the other issue.

tensor-compiler / array-programming-benchmarks

taco: xorOp higher-order tensor code and Merge Lattice generation bug #9