NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")
Other
271 stars 53 forks source link

Non-deterministic output tensor names #3396

Closed naoyam closed 1 week ago

naoyam commented 1 week ago

This showed up here, but I remember I have seen something like this time to time.

Repro (modified from NVFuserTest.FusionVarMean_CUDA):

TEST_F(NVFuserTest, NonDeterministicOutputNameRepro) {
  auto repro = [] () {
    auto fusion = std::make_unique<Fusion>();
    FusionGuard fg(fusion.get());

    int M = 64, N = 128;

    auto tv0 = makeSymbolicTensor(2);
    fusion->addInput(tv0);
    auto tvs = variance_mean(tv0, {1}, 0, true);
    auto tv_mean = tvs.mean;
    auto tv_var = tvs.var;
    fusion->addOutput(tv_var);
    fusion->addOutput(tv_mean);

    auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
    at::Tensor t0 = at::randn({M, N}, options);

    FusionExecutorCache executor_cache(std::move(fusion));
    auto outputs = executor_cache.runFusionWithInputs({t0});
  };

  int num_iters = 10;
  if (getenv("NUM_ITERS")) {
    num_iters = std::atoi(getenv("NUM_ITERS"));
  }

  for (const auto i: c10::irange(num_iters)) {
    (void)i;
    repro();
  }
}

Run the repro as:

 NVFUSER_DUMP=cuda_kernel ./bin/nvfuser_tests --gtest_filter='*NonDeterministicOutputNameRepro' 2>&1 |tee log 

And then:

grep __global__ log                                                                                                                                                           
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T7, Tensor<float, 1, 1> T8) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T7, Tensor<float, 1, 1> T8) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T8, Tensor<float, 1, 1> T7) {
__global__ void nvfuser_inner_persistent_f0_c1_r0_g2(Tensor<float, 2, 2> T0, Tensor<float, 1, 1> T7, Tensor<float, 1, 1> T8) {

As you can see, the parameter list is either T0, T8, T7 or T0, T7, T8.

I'll add more info but I suspect this happens before segmentation.

naoyam commented 1 week ago

MarkAliasesPreparePass seems to insert SegmenterSet in a non-deterministic order. After the pass, the fusion looks like below:

%kernel_math {
T1_l_float[ iS2{i0}, rS3{i2} ](Avg),
T2_l_float[ iS4{i0}, rS5{i2} ](Var),
T3_l_nvfuser_index_t[ iS6{i0}, rS7{i2} ](Count)
 = Welford ( T0_g_float[ iS0{i0}, iS1{i2} ](Avg),
  allreduce = false )
d4 = (double)(i2);
d6 = double(1) * d4;
d10 = (double)(0);
d12 = d6 - d10;
d14 = (double)(0);
b16 = d12 >= d14;
d18 = (double)(0);
d20 = where(b16, d12, d18);
d26 = reciprocal(d20);
T4_l_float[ iS8{i0} ]
   = T2_l_float[ iS4{i0}, rS5{i2} ]
   * d26;
T7_l_float[ iS13{i0} ]
   = SegmenterSet( T4_l_float[ iS8{i0} ] )
T5_g_float[ iS9{i0}, bS10{1} ]
   = broadcast( T7_l_float[ iS13{i0} ] )
T8_l_float[ iS14{i0} ]
   = SegmenterSet( T1_l_float[ iS2{i0}, rS3{i2} ] )
T6_g_float[ iS11{i0}, bS12{1} ]
   = broadcast( T8_l_float[ iS14{i0} ] )
} // %kernel_math

However, sometimes it looks like:

%kernel_math {
T1_l_float[ iS2{i0}, rS3{i2} ](Avg),
T2_l_float[ iS4{i0}, rS5{i2} ](Var),
T3_l_nvfuser_index_t[ iS6{i0}, rS7{i2} ](Count)
 = Welford ( T0_g_float[ iS0{i0}, iS1{i2} ](Avg),
  allreduce = false )
d4 = (double)(i2);
d6 = double(1) * d4;
d10 = (double)(0);
d12 = d6 - d10;
d14 = (double)(0);
b16 = d12 >= d14;
d18 = (double)(0);
d20 = where(b16, d12, d18);
d26 = reciprocal(d20);
T4_l_float[ iS8{i0} ]
   = T2_l_float[ iS4{i0}, rS5{i2} ]
   * d26;
T8_l_float[ iS14{i0} ]
   = SegmenterSet( T4_l_float[ iS8{i0} ] )
T5_g_float[ iS9{i0}, bS10{1} ]
   = broadcast( T8_l_float[ iS14{i0} ] )
T7_l_float[ iS13{i0} ]
   = SegmenterSet( T1_l_float[ iS2{i0}, rS3{i2} ] )
T6_g_float[ iS11{i0}, bS12{1} ]
   = broadcast( T7_l_float[ iS13{i0} ] )
} // %kernel_math

Notice that the first case has:

T7_l_float[ iS13{i0} ]
   = SegmenterSet( T4_l_float[ iS8{i0} ] )
T8_l_float[ iS14{i0} ]
   = SegmenterSet( T1_l_float[ iS2{i0}, rS3{i2} ] )

But in the second case:

T8_l_float[ iS14{i0} ]
   = SegmenterSet( T4_l_float[ iS8{i0} ] )
T7_l_float[ iS13{i0} ]
   = SegmenterSet( T1_l_float[ iS2{i0}, rS3{i2} ] )

@wujingyue, is this something you could look into?

wujingyue commented 1 week ago

Sure! I suspect https://github.com/NVIDIA/Fuser/blob/030c2ba28aaf22fbdbbcd8c490a705c289ada6a4/csrc/preseg_passes/mark_aliases_prepare.cpp#L191-L194, but give me some time to confirm that...