NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")
Other
244 stars 46 forks source link

bug in propagateReshapeTransforms #2593

Closed liqiangxl closed 1 month ago

liqiangxl commented 1 month ago

To reproduce, use the repro test below. The following fusion failed with the error message: C++ exception with description "maybe_unmapped_ids.count(p_id) INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/transform_replay.cpp":603, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Could not find axis, iS34{( ceilDiv(i0, 3) )}rf, requested in replaying consumer T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ] as producer T9_l[ iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf ] Exception raised from replayCasP at /opt/pytorch/nvfuser/csrc/transform_replay.cpp:603


// Repro fusion: two identical reshape chains (split both dims, then merge the
// two middle dims) branch off the same cast tensor tv1. The first chain feeds
// a pointwise multiply before becoming an output; the second chain is an
// output directly. The test only compiles and runs the fusion; it performs no
// numerical validation of the results.
TEST_F(GpuViewTest, SplitMergePointwiseSplitMerge) {
  // Concrete sizes shared by the reshape ops and the runtime input.
  const std::vector<int64_t> input_shape{12, 20};
  const std::vector<int64_t> split_shape{3, 4, 4, 5};
  const std::vector<int64_t> merged_shape{3, 16, 5};
  const DataType dtype = DataType::Float;

  std::unique_ptr<Fusion> fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  // NOTE: the order of the IR-building calls below is kept exactly as in the
  // original repro so that tensor numbering matches the attached logs.
  auto tv0 = makeContigTensor(input_shape.size(), dtype);
  fusion->addInput(tv0);
  auto tv1 = castOp(DataType::Float, tv0);

  // First chain: split -> merge -> pointwise.
  // tv2  root domain    : (i0, i2)
  //      logical domain : (3, i0/3, 4, i2/4)
  auto tv2 = reshape(tv1, input_shape, split_shape);
  // tv3  root domain    : (3, i0/3, 4, i2/4)
  //      logical domain : (3, i0/3*4, i2/4)
  auto tv3 = reshape(tv2, split_shape, merged_shape);
  // tv4  root domain    : (3, i0/3*4, i2/4)
  auto tv4 = mul(tv3, tv3);

  // Second chain: the same split -> merge, again starting from tv1.
  // tv5  root domain    : (i0, i2)
  //      logical domain : (3, i0/3, 4, i2/4)
  auto tv5 = reshape(tv1, input_shape, split_shape);
  // tv6  root domain    : (3, i0/3, 4, i2/4)
  //      logical domain : (3, i0/3*4, i2/4)
  auto tv6 = reshape(tv5, split_shape, merged_shape);

  fusion->addOutput(tv4);
  fusion->addOutput(tv6);

  // Random input on CUDA device 0 with the ATen dtype matching `dtype`.
  const auto tensor_opts = at::TensorOptions()
                               .dtype(data_type_to_aten(dtype))
                               .device(at::kCUDA, 0);
  auto t0 = at::randn(input_shape, tensor_opts);

  // Hand the fusion over to the executor cache and run it once.
  FusionExecutorCache executor_cache(std::move(fusion));
  auto cg_outputs = executor_cache.runFusionWithInputs({t0});
}

Some debug info

Propagating transformations from T2_l[ iS7{3}rf, iS11{( ceilDiv(i2, 4) )}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf ]
Error in replayCasP consumer T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ]
 logical domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
 allocation domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})

Error in replayCasP consumer T9_l[ iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf ]
 root domain : (iS30{3}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf, iS33{( ceilDiv(i2, 4) )})
  Merge: iS34{( ceilDiv(i0, 3) )}rf and iS35{4}rf -> iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf
 logical domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
 allocation domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf)
liqiangxl commented 1 month ago

Fusion before propagateReshapeTransforms


%kernel {
T7_l[ iS39{i0}, iS40{i2} ]
   = Set( T0_g[ iS0{i0}, iS1{i2} ], cache_op=Streaming )
T1_l[ iS2{i0}, iS3{i2} ]
   = Set( T7_l[ iS39{i0}, iS40{i2} ], cache_op=Streaming )
T2_l[ iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf ] = view( T1_l[ iS2{i0}, iS3{i2} ] )
T3_l[ iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )} ] = view( T2_l[ iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf ] )
T8_l[ iS19{3}, iS20{( ( ceilDiv(i0, 3) ) * 4 )}, iS21{( ceilDiv(i2, 4) )} ]
   = T3_l[ iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )} ]
   * T3_l[ iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )} ];
T4_g[ iS41{3}, iS42{( ( ceilDiv(i0, 3) ) * 4 )}, iS43{( ceilDiv(i2, 4) )} ]
   = Set( T8_l[ iS19{3}, iS20{( ( ceilDiv(i0, 3) ) * 4 )}, iS21{( ceilDiv(i2, 4) )} ], cache_op=Streaming )
T5_l[ iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf ] = view( T1_l[ iS2{i0}, iS3{i2} ] )
T9_l[ iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )} ] = view( T5_l[ iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf ] )
T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ]
   = Set( T9_l[ iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )} ], cache_op=Streaming )

TransformPrinter : 
T0_g[ iS0{i0}, iS1{i2} ]
 logical domain : (iS0{i0}, iS1{i2})
 contiguity: t t
 loop domain : (iS0{i0}, iS1{i2})
T7_l[ iS39{i0}, iS40{i2} ]
 logical domain : (iS39{i0}, iS40{i2})
 contiguity: t t
 loop domain : (iS39{i0}, iS40{i2})
T1_l[ iS2{i0}, iS3{i2} ]
 logical domain : (iS2{i0}, iS3{i2})
 allocation domain : (iS2{i0}, iS3{i2})
 contiguity: t t
 loop domain : (iS2{i0}, iS3{i2})
T2_l[ iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf ]
 root domain : (iS6{i0}rf, iS9{i2}rf)
  Outer split: iS6{i0}rf by factor 3 -> iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf
  Outer split: iS9{i2}rf by factor 4 -> iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf
 logical domain : (iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf)
 allocation domain : (iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf)
 contiguity: t t t t
 loop domain : (iS7{3}rf, iS8{( ceilDiv(i0, 3) )}rf, iS10{4}rf, iS11{( ceilDiv(i2, 4) )}rf)
T3_l[ iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )} ]
 root domain : (iS12{3}, iS16{( ceilDiv(i0, 3) )}rf, iS17{4}rf, iS15{( ceilDiv(i2, 4) )})
  Merge: iS16{( ceilDiv(i0, 3) )}rf and iS17{4}rf -> iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf
 logical domain : (iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )})
 allocation domain : (iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS12{3}, iS18{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS15{( ceilDiv(i2, 4) )})
T8_l[ iS19{3}, iS20{( ( ceilDiv(i0, 3) ) * 4 )}, iS21{( ceilDiv(i2, 4) )} ]
 logical domain : (iS19{3}, iS20{( ( ceilDiv(i0, 3) ) * 4 )}, iS21{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS19{3}, iS20{( ( ceilDiv(i0, 3) ) * 4 )}, iS21{( ceilDiv(i2, 4) )})
T4_g[ iS41{3}, iS42{( ( ceilDiv(i0, 3) ) * 4 )}, iS43{( ceilDiv(i2, 4) )} ]
 logical domain : (iS41{3}, iS42{( ( ceilDiv(i0, 3) ) * 4 )}, iS43{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS41{3}, iS42{( ( ceilDiv(i0, 3) ) * 4 )}, iS43{( ceilDiv(i2, 4) )})
T5_l[ iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf ]
 root domain : (iS24{i0}rf, iS27{i2}rf)
  Outer split: iS24{i0}rf by factor 3 -> iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf
  Outer split: iS27{i2}rf by factor 4 -> iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf
 logical domain : (iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf)
 allocation domain : (iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf)
 contiguity: t t t t
 loop domain : (iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf)
T9_l[ iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )} ]
 root domain : (iS30{3}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf, iS33{( ceilDiv(i2, 4) )})
  Merge: iS34{( ceilDiv(i0, 3) )}rf and iS35{4}rf -> iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf
 logical domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
 allocation domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ]
 logical domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
 allocation domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
} // %kernel
liqiangxl commented 1 month ago

The error is caused by an incorrect replay position. When replaying from T1 to T5, the correct position is 2, but 4 was returned. The subsequent replays T5 -> T9 and T9 -> T6 with pos=4 then triggered the original error.

TransformPropagator::propagateP2C
  from: T1_l[ iS47{3}, iS50{( ceilDiv(i2, 4) )}, iS48{( ceilDiv(i0, 3) )}, iS49{4} ] @ 2
  to: T5_l[ iS25{3}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf, iS29{( ceilDiv(i2, 4) )}rf ]
  new_pos: -1
  TransformReplay::replayCasP: 
  producer_pos: 2
  target_producer_ids: iS47{3}
  target_producer_ids: iS50{( ceilDiv(i2, 4) )}
  forwarded_replay_map map: iS50{( ceilDiv(i2, 4) )} : iS29{( ceilDiv(i2, 4) )}rf
  forwarded_replay_map map: iS47{3} : iS25{3}rf
  forwarded_replay_map map: iS48{( ceilDiv(i0, 3) )} : iS26{( ceilDiv(i0, 3) )}rf
  forwarded_replay_map map: iS49{4} : iS28{4}rf
  replay map: iS50{( ceilDiv(i2, 4) )} : iS29{( ceilDiv(i2, 4) )}rf
  replay map: iS47{3} : iS25{3}rf
  replay map: iS48{( ceilDiv(i0, 3) )} : iS26{( ceilDiv(i0, 3) )}rf
  replay map: iS49{4} : iS28{4}rf
  type-1 new ids: iS25{3}rf
  type-1 new ids: iS29{( ceilDiv(i2, 4) )}rf
  type-2 new ids: iS26{( ceilDiv(i0, 3) )}rf
  type-2 new ids: iS28{4}rf
  replayed: T5_l[ iS25{3}rf, iS29{( ceilDiv(i2, 4) )}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf ] @ 4

TransformPropagator::propagateP2C
  from: T5_l[ iS25{3}rf, iS29{( ceilDiv(i2, 4) )}rf, iS26{( ceilDiv(i0, 3) )}rf, iS28{4}rf ] @ 4
  to: T9_l[ iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )} ]
  new_pos: -1
  TransformReplay::replayCasP: 
  producer_pos: 4
  target_producer_ids: iS25{3}rf
  target_producer_ids: iS29{( ceilDiv(i2, 4) )}rf
  target_producer_ids: iS26{( ceilDiv(i0, 3) )}rf
  target_producer_ids: iS28{4}rf
  forwarded_replay_map map: iS25{3}rf : iS30{3}
  forwarded_replay_map map: iS26{( ceilDiv(i0, 3) )}rf : iS34{( ceilDiv(i0, 3) )}rf
  forwarded_replay_map map: iS28{4}rf : iS35{4}rf
  forwarded_replay_map map: iS29{( ceilDiv(i2, 4) )}rf : iS33{( ceilDiv(i2, 4) )}
  replay map: iS25{3}rf : iS30{3}
  replay map: iS26{( ceilDiv(i0, 3) )}rf : iS34{( ceilDiv(i0, 3) )}rf
  replay map: iS28{4}rf : iS35{4}rf
  replay map: iS29{( ceilDiv(i2, 4) )}rf : iS33{( ceilDiv(i2, 4) )}
  type-1 new ids: iS30{3}
  type-1 new ids: iS33{( ceilDiv(i2, 4) )}
  type-1 new ids: iS34{( ceilDiv(i0, 3) )}rf
  type-1 new ids: iS35{4}rf
  replayed: T9_l[ iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf ] @ 4

TransformPropagator::propagateP2C
  from: T9_l[ iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf ] @ 4
  to: T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ]
  new_pos: -1
  TransformReplay::replayCasP: 
  producer_pos: 4
  target_producer_ids: iS30{3}
  target_producer_ids: iS33{( ceilDiv(i2, 4) )}
  target_producer_ids: iS34{( ceilDiv(i0, 3) )}rf
  target_producer_ids: iS35{4}rf
  forwarded_replay_map map: iS30{3} : iS44{3}
  forwarded_replay_map map: iS33{( ceilDiv(i2, 4) )} : iS46{( ceilDiv(i2, 4) )}
  replay map: iS30{3} : iS44{3}
  replay map: iS33{( ceilDiv(i2, 4) )} : iS46{( ceilDiv(i2, 4) )}
Error in replayCasP consumer T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ]
 logical domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
 allocation domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )})

Error in replayCasP consumer T9_l[ iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf ]
 root domain : (iS30{3}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf, iS33{( ceilDiv(i2, 4) )})
  Merge: iS34{( ceilDiv(i0, 3) )}rf and iS35{4}rf -> iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf
 logical domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
 allocation domain : (iS30{3}, iS36{( ( ceilDiv(i0, 3) ) * 4 )}rf, iS33{( ceilDiv(i2, 4) )})
 contiguity: t t t
 loop domain : (iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf)
unknown file: Failure
C++ exception with description "maybe_unmapped_ids.count(p_id) INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/transform_replay.cpp":633, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Could not find axis, iS34{( ceilDiv(i0, 3) )}rf, requested in replaying consumer T6_g[ iS44{3}, iS45{( ( ceilDiv(i0, 3) ) * 4 )}, iS46{( ceilDiv(i2, 4) )} ] as producer T9_l[ iS30{3}, iS33{( ceilDiv(i2, 4) )}, iS34{( ceilDiv(i0, 3) )}rf, iS35{4}rf ]
Exception raised from replayCasP at /opt/pytorch/nvfuser/csrc/transform_replay.cpp:633 (most recent call first):