PTX code for async copy of bool type is not correctly generated

Test DistributedTransformerTest.Backward/__bfloat has bool type tensor, when shared memory persistent is used with async copy, it triggers a bug. A mini-reproduce is as follows:

TEST_F(NVFuserTest, CpAsyncDataTypeBool) {
  Fusion fusion;
  FusionGuard fg(&fusion);
  auto dtype = DataType::Bool;
  int m = 33, n = 128;
  auto tv0 = makeContigConcreteTensor({m, n}, dtype);
  fusion.addInput(tv0);
  auto tv1 = set(tv0);
  tv1->setMemoryType(MemoryType::Shared);
  tv1->definition()->as<LoadStoreOp>()->setOpType(
      LoadStoreOpType::CpAsync);
  tv1->definition()->as<LoadStoreOp>()->setCacheOp(CacheOp::Unspecified);
  auto tv2 = castOp(DataType::Int32, tv1);
  fusion.addOutput(tv2);

  for(auto tv : {tv0, tv1, tv2}){
    tv->split(1, 4);
  }
  for(auto tv : {tv0, tv1, tv2}){
    tv->axis(0)->parallelize(ParallelType::BIDx);
    tv->axis(1)->parallelize(ParallelType::TIDx);
  }
  tv1->axis(2)->parallelize(ParallelType::Vectorize);

  inlineMost();

  auto at_dtype = data_type_to_aten(dtype);
  auto options = at::TensorOptions().dtype(at_dtype).device(at::kCUDA, 0);
  // randn deosn't support bool type
  at::Tensor t0 = at::ones({m, n}, options);

  FusionExecutor fe;

  // requires ampere+ GPU
  if (!deviceMajorMinorCheck(8)) {
    ASSERT_THAT(
        [&]() { fe.compileFusion(&fusion, {t0}); },
        testing::ThrowsMessage<nvfuser::nvfError>(testing::HasSubstr(
            "Reason: LoadStoreOpType::CpAsync requires Ampere")));
    GTEST_SKIP() << "skipping tests on pre-AMPERE GPUs";
  } else {
    fe.compileFusion(&fusion, {t0});
  }
  fe.compileFusion(&fusion, {t0});
  auto cg_outputs = fe.runFusion({t0});

  testValidate(&fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}

The generated code is:

__global__ void nvfuser_none_f0_c0_r0_g0(Tensor<bool, 2, 2> T0, Tensor<int, 2, 2> T2) {
  alignas(16) extern __shared__ char array[];
  const unsigned smem_offset = 0;
  NVFUSER_DEFINE_MAGIC_ZERO;
  nvfuser_index_t i0;
  i0 = 4LL * ((nvfuser_index_t)threadIdx.x);
  nvfuser_index_t i1;
  i1 = 128LL * ((nvfuser_index_t)blockIdx.x);
  nvfuser_index_t i2;
  i2 = i0 + i1;
  bool b3;
  b3 = (3LL + i0) < 128LL;
  bool* T1 = reinterpret_cast<bool*>(array + smem_offset + 0LL);
  asm volatile(
    "{\n"
    "  .reg .pred p0; \n"
    "  setp.ne.b32 p0, %0, 0;\n"
    "  .reg .pred p1; \n"
    "  setp.ne.b32 p1, %1, 0;\n"
    "  .reg .pred p2; \n"
    "  setp.ne.b32 p2, %3, 0;\n"
    "  cp.async.ca.shared.global [%0], [%1], %2, p0;\n"
    "}\n"
    :
    :"r"((uint32_t)((uint32_t)((toSmem(T1) + i0)))),
     "l"((uint32_t)(((T0.data + i0) + i1))),
     "n"(4LL),
     "r"((uint32_t)((!b3)))
  );
  asm volatile("cp.async.wait_all;\n");
  #pragma unroll
  for(nvfuser_index_t i4 = 0LL; i4 < 4LL; ++i4) {
    if (b3) {
      T2[(i2 + (i4 + nvfuser_zero))]
         = (int32_t)(T1[(i0 + i4)]);
    }
  }
  NVFUSER_UPDATE_MAGIC_ZERO;
}

Durint lowering, the pointer to bool is processed as bool and being converted to uint32_t.

NVIDIA / Fuser

PTX code for async copy of bool type is not correctly generated #3273