NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")
Other
268 stars 52 forks source link

Invalid unswitch predicates with double buffering #2159

Closed naoyam closed 3 months ago

naoyam commented 6 months ago

Looks like we are not generating correct predicates when unswitching double buffered loops. The generated code with DoubleBufferingTest.DoubleBuffer5 seems incorrect. Here's a simplified repro based on that test:

TEST_F(DoubleBufferingTest, UnswitchRepro) {
  Fusion fusion;
  FusionGuard fg(&fusion);

  auto tv0 = makeContigTensor(1);
  fusion.addInput(tv0);

  auto tv1 = set(tv0);
  auto tv2 = add(tv1, IrBuilder::create<Val>(1.0));
  fusion.addOutput(tv2);

  tv2->split(-1, 8);
  tv2->split(0, 1, false);
  TransformPropagatorWithCheck propagator(tv2);
  MaxRootDomainInfoSpanningTree(tv2).traverse(&propagator);

  tv1->inlineAt(2);

  tv2->axis(0)->parallelize(ParallelType::Unswitch);
  scheduler_utils::parallelizeAllLike(tv2);

  tv1->doubleBuffer();

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  auto t0 = at::randn({16}, options);

  FusionExecutor fe;
  fe.compileFusion(&fusion, {t0});
  auto cg_outputs = fe.runFusion({t0});

  auto ref = t0 + 1;

  testValidate(&fusion, cg_outputs, {t0}, {ref}, __LINE__, __FILE__);
}

Here's the unswitched branch of the generated code:

 nvfuser_index_t i0;
  i0 = ceilDiv(T0.logical_size[0LL], 8LL);
 if (((-1LL + (8LL * i0)) < T0.logical_size[0LL])) {
    float T1[16LL];
    #pragma unroll
    for(nvfuser_index_t i1 = 0; i1 < 8LL; ++i1) {
      T1[i1]
         = T0[(i1 + nvfuser_zero)];
    }
    NVFUSER_UPDATE_MAGIC_ZERO;
    #pragma unroll 1
    for(nvfuser_index_t i2 = 0; i2 < i0; ++i2) {
      nvfuser_index_t i3;
      i3 = 8LL * i2;
      nvfuser_index_t i4;
      i4 = 8LL + i3;
      nvfuser_index_t i5;
      i5 = 8LL * ((1LL + i2) % 2LL);
      nvfuser_index_t i6;
      i6 = 8LL * (i2 % 2LL);
      #pragma unroll
      for(nvfuser_index_t i1 = 0; i1 < 8LL; ++i1) {
        T1[(i5 + i1)]
           = T0[(i4 + (i1 + nvfuser_zero))];
      }
      NVFUSER_UPDATE_MAGIC_ZERO;
      #pragma unroll
      for(nvfuser_index_t i7 = 0; i7 < 8LL; ++i7) {
        T2[(i3 + (i7 + nvfuser_zero))]
          = T1[(i6 + i7)]
          + (float) 1.00000000000000000e+00;
      }
      NVFUSER_UPDATE_MAGIC_ZERO;
    }

The test doesn't fail, but the read from T0 can overrun the buffer. Indeed, running this test with compute-sanitizer does fail at that point.

I haven't checked if this also applies to circular buffering, but I suspect that's highly likely.

naoyam commented 6 months ago

Fixed in the new indexing branch: https://github.com/NVIDIA/Fuser/tree/IterDomainGraphs_indexing