csarofeen / pytorch

Tensors and Dynamic neural networks in Python with strong GPU acceleration
http://pytorch.org
Other
26 stars 7 forks source link

Compilation error on reduction of expanded input #2165

Open zasdfgbnm opened 1 year ago

zasdfgbnm commented 1 year ago

🐛 Describe the bug

TEST_F(NVFuserTest, FusionExpandedInput2_CUDA) {
  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
  auto fusion = fusion_ptr.get();
  FusionGuard fg(fusion);

  TensorView* tv0 = TensorViewBuilder()
                        .ndims(3)
                        .shape({-1, -1, -1})
                        .contiguity({false, false, true})
                        .expanded({false, true, false})
                        .build();
  fusion->addInput(tv0);
  auto tv1 = sum(tv0, {1});
  fusion->addOutput(tv1);

  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
  at::Tensor t0 = at::randn({4096, 1, 1024}, options).expand({-1, 7, -1});

  FusionExecutorCache fec(std::move(fusion_ptr));
  auto cg_outputs = fec.runFusionWithInputs({t0});

  testValidate(fusion, cg_outputs, {t0}, {t0.sum(1)}, __LINE__, __FILE__);
}

error

CUDA NVRTC compile error: __tmp_kernel1.cu(7997): error: identifier "i1" is undefined

Versions

devel

zasdfgbnm commented 1 year ago

Generated code:

__global__ void kernel1(Tensor<float, 3> T0, Tensor<float, 2> T1) {
  NVFUSER_DEFINE_MAGIC_ZERO
  int i108;
  i108 = (((nvfuser_index_t)blockIdx.x) * ((nvfuser_index_t)blockDim.x)) + ((nvfuser_index_t)threadIdx.x);
  int i50;
  i50 = ((((nvfuser_index_t)blockIdx.x) * ((nvfuser_index_t)blockDim.x)) + ((nvfuser_index_t)threadIdx.x)) / T0.size[2];
  int i51;
  i51 = ((((nvfuser_index_t)blockIdx.x) * ((nvfuser_index_t)blockDim.x)) + ((nvfuser_index_t)threadIdx.x)) % T0.size[2];
  float T2[1];
  if ((i108 < (T0.size[0] * T0.size[2]))) {
    T2[0]
       = T0[(i50 * T0.stride[0]) + i51];
  } else {
    if ((i108 < (T0.size[0] * T0.size[2]))) {
      T2[0]
         = T0[(i50 * T0.stride[0]) + i51];
    }
  }
  float T4[1];
  T4[0] = 0.00000000000000000e+00;
  #pragma unroll 1
  for(nvfuser_index_t i38 = 0; i38 < (ceilDiv((ceilDiv(i1, 4)), 1)); ++i38) {
    if (((i108 < (T0.size[0] * T0.size[2])) && (((i38 * 4) + 3) < i1))) {
      #pragma unroll
      for(nvfuser_index_t i40 = 0; i40 < 4; ++i40) {
        T4[0]
          = T4[0]
          + T2[0];
      }
      NVFUSER_UPDATE_MAGIC_ZERO
    } else {
      #pragma unroll
      for(nvfuser_index_t i40 = 0; i40 < 4; ++i40) {
        if (((i108 < (T0.size[0] * T0.size[2])) && (((i38 * 4) + (i40 + nvfuser_zero)) < i1))) {
          T4[0]
            = T4[0]
            + T2[0];
        }
      }
      NVFUSER_UPDATE_MAGIC_ZERO
    }
  }
  float T3[1];
  T3[0] = 0.00000000000000000e+00;
  T3[0]
    = T3[0]
    + T4[0];
  if ((i108 < (T0.size[0] * T0.size[2]))) {
    T1[i108]
       = T3[0];
  }
}