NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")

Transform replay failed: Exception in thread pool task: !replay_has_rfactor_inp INTERNAL ASSERT FAILED #1023

Closed · jjsjann123 closed this issue 1 year ago

jjsjann123 commented 1 year ago

Repro script:

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id19(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[-1, -1, 1], contiguity=[True, True, None], dtype=DataType.Float, is_cpu=False)
    T1 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True], dtype=DataType.Float, is_cpu=False)
    T2 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[None, None, True], dtype=DataType.Float, is_cpu=False)
    T3 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True], dtype=DataType.Float, is_cpu=False)
    T4 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True], dtype=DataType.Float, is_cpu=False)
    T5 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True], dtype=DataType.Float, is_cpu=False)
    T6 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, None], dtype=DataType.Float, is_cpu=False)
    T7 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True], dtype=DataType.Float, is_cpu=False)
    S8 = fd.define_scalar(4, dtype=DataType.Int)
    S9 = fd.define_scalar(64, dtype=DataType.Int)
    S10 = fd.define_scalar(768, dtype=DataType.Int)
    V11 = fd.define_vector([S8, S9, S10], dtype=DataType.Int)
    T12 = fd.ops.broadcast_in_dim(T0, shape=V11, broadcast_dims=[0, 1, 2])
    T13 = fd.ops.mul(T1, T12)
    T14 = fd.ops.sum(T4, axes=[0, 1], keepdim=False, dtype=DataType.Null)
    S15 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T16 = fd.ops.mul(S15, T5)
    T17 = fd.ops.sum(T16, axes=[0, 1], keepdim=False, dtype=DataType.Null)
    T18 = fd.ops.mul(T16, T2)
    T19 = fd.ops.mul(T16, T13)
    T20 = fd.ops.sum(T19, axes=[0, 1], keepdim=False, dtype=DataType.Null)
    T21 = fd.ops.mul(T18, T12)
    T22 = fd.ops.mul(T18, T1)
    T23 = fd.ops.sum(T22, axes=[2], keepdim=False, dtype=DataType.Null)
    S24 = fd.define_scalar(4, dtype=DataType.Int)
    S25 = fd.define_scalar(64, dtype=DataType.Int)
    S26 = fd.define_scalar(1, dtype=DataType.Int)
    V27 = fd.define_vector([S24, S25, S26], dtype=DataType.Int)
    T28 = fd.ops.broadcast_in_dim(T23, shape=V27, broadcast_dims=[0, 1])
    T29 = fd.ops.neg(T21)
    T30 = fd.ops.sum(T29, axes=[2], keepdim=False, dtype=DataType.Null)
    S31 = fd.define_scalar(4, dtype=DataType.Int)
    S32 = fd.define_scalar(64, dtype=DataType.Int)
    S33 = fd.define_scalar(1, dtype=DataType.Int)
    V34 = fd.define_vector([S31, S32, S33], dtype=DataType.Int)
    T35 = fd.ops.broadcast_in_dim(T30, shape=V34, broadcast_dims=[0, 1])
    S36 = fd.define_scalar(-0.500000, dtype=DataType.Double)
    T37 = fd.ops.mul(S36, T28)
    S38 = fd.define_scalar(3.00000, dtype=DataType.Double)
    T39 = fd.ops.pow(T0, S38)
    T40 = fd.ops.mul(T37, T39)
    S41 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T42 = fd.ops.mul(S41, T40)
    T43 = fd.ops.sum(T35, axes=[2], keepdim=False, dtype=DataType.Null)
    T44 = fd.ops.sum(T42, axes=[2], keepdim=False, dtype=DataType.Null)
    S45 = fd.define_scalar(4, dtype=DataType.Int)
    S46 = fd.define_scalar(64, dtype=DataType.Int)
    S47 = fd.define_scalar(1, dtype=DataType.Int)
    V48 = fd.define_vector([S45, S46, S47], dtype=DataType.Int)
    T49 = fd.ops.broadcast_in_dim(T44, shape=V48, broadcast_dims=[0, 1])
    S50 = fd.define_scalar(4, dtype=DataType.Int)
    S51 = fd.define_scalar(64, dtype=DataType.Int)
    S52 = fd.define_scalar(768, dtype=DataType.Int)
    V53 = fd.define_vector([S50, S51, S52], dtype=DataType.Int)
    T54 = fd.ops.broadcast_in_dim(T49, shape=V53, broadcast_dims=[0, 1, 2])
    S55 = fd.define_scalar(2.00000, dtype=DataType.Double)
    T56 = fd.ops.mul(S55, T54)
    T57 = fd.ops.sub(T7, T6)
    T58 = fd.ops.mul(T56, T57)
    S59 = fd.define_scalar(768.000, dtype=DataType.Double)
    S60 = fd.ops.reciprocal(S59)
    T61 = fd.ops.mul(T58, S60)
    S62 = fd.define_scalar(4, dtype=DataType.Int)
    S63 = fd.define_scalar(64, dtype=DataType.Int)
    S64 = fd.define_scalar(1, dtype=DataType.Int)
    V65 = fd.define_vector([S62, S63, S64], dtype=DataType.Int)
    T66 = fd.ops.broadcast_in_dim(T43, shape=V65, broadcast_dims=[0, 1])
    S67 = fd.define_scalar(4, dtype=DataType.Int)
    S68 = fd.define_scalar(64, dtype=DataType.Int)
    S69 = fd.define_scalar(768, dtype=DataType.Int)
    V70 = fd.define_vector([S67, S68, S69], dtype=DataType.Int)
    T71 = fd.ops.broadcast_in_dim(T66, shape=V70, broadcast_dims=[0, 1, 2])
    S72 = fd.define_scalar(0.00130208, dtype=DataType.Double)
    T73 = fd.ops.mul(S72, T71)
    T74 = fd.ops.add(T61, T73)
    T75 = fd.ops.add(T21, T74)
    T76 = fd.ops.add(T3, T75)
    S77 = fd.define_scalar(1.00000, dtype=DataType.Double)
    T78 = fd.ops.mul(S77, T76)
    T79 = fd.ops.sum(T78, axes=[0], keepdim=False, dtype=DataType.Null)
    fd.add_output(T14)
    fd.add_output(T17)
    fd.add_output(T20)
    fd.add_output(T78)
    fd.add_output(T79)

with FusionDefinition() as fd:
    nvfuser_fusion_id19(fd)

inputs = [
    torch.randn((256,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 1), (64, 1, 1)),
    torch.randn((196608,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 768), (49152, 768, 1)),
    torch.randn((768,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 768), (0, 0, 1)),
    torch.randn((196608,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 768), (49152, 768, 1)),
    torch.randn((589824,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 2304), (147456, 2304, 1)),
    torch.randn((196608,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 768), (49152, 768, 1)),
    torch.randn((256,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 768), (64, 1, 0)),
    torch.randn((196608,), dtype=torch.float32, device='cuda:0').as_strided((4, 64, 768), (49152, 768, 1)),
]
fd.execute(inputs)
Exception in thread pool task: !replay_has_rfactor_inp INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/transform_iter.cpp":515, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Error during replay, a transformation was called that conflicts with an rfactor call.

frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x8d (0x7fb4bb7a1e47 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #1: nvfuser::nvfErrorFail(char const*, char const*, unsigned int, char const*, std::string const&) + 0x53 (0x7fb4bb8e0b13 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #2: nvfuser::BestEffortReplay::BestEffortReplay(std::vector<nvfuser::IterDomain*, std::allocator<nvfuser::IterDomain*> > const&, std::vector<nvfuser::IterDomain*, std::allocator<nvfuser::IterDomain*> > const&, std::unordered_map<nvfuser::IterDomain*, nvfuser::IterDomain*, std::hash<nvfuser::IterDomain*>, std::equal_to<nvfuser::IterDomain*>, std::allocator<std::pair<nvfuser::IterDomain* const, nvfuser::IterDomain*> > >, std::unordered_map<nvfuser::IterDomain*, nvfuser::IterDomain*, std::hash<nvfuser::IterDomain*>, std::equal_to<nvfuser::IterDomain*>, std::allocator<std::pair<nvfuser::IterDomain* const, nvfuser::IterDomain*> > >, std::unordered_map<nvfuser::IterDomain*, nvfuser::IterDomain*, std::hash<nvfuser::IterDomain*>, std::equal_to<nvfuser::IterDomain*>, std::allocator<std::pair<nvfuser::IterDomain* const, nvfuser::IterDomain*> > >, bool, bool, bool) + 0x1afa (0x7fb4bbcf84aa in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #3: nvfuser::BestEffortReplay::replayCasP(nvfuser::TensorView const*, nvfuser::TensorView const*, int, nvfuser::RootDomainMap const&, bool, bool, bool) + 0x6d5 (0x7fb4bbcfb8b5 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #4: nvfuser::TransformReplay::replayCasP(nvfuser::TensorView const*, nvfuser::TensorView const*, long, nvfuser::RootDomainMap const&, nvfuser::TransformReplayOptions) + 0x20e (0x7fb4bbd053fe in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #5: nvfuser::TransformReplay::replayCasP(nvfuser::TensorView const*, nvfuser::TensorView const*, long, nvfuser::TransformReplayOptions) + 0x52 (0x7fb4bbd07092 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #6: nvfuser::TransformPropagator::propagateP2C(nvfuser::TensorView*, nvfuser::TensorView*) + 0x105 (0x7fb4bbd071c5 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #7: nvfuser::MaxInfoSpanningTree::traverse(nvfuser::MaxInfoSpanningTree::Propagator*) + 0xce (0x7fb4bbb5159e in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #8: nvfuser::reduction_scheduler_utils::propagateTransformation(nvfuser::TensorView*, std::unordered_set<nvfuser::TensorView*, std::hash<nvfuser::TensorView*>, std::equal_to<nvfuser::TensorView*>, std::allocator<nvfuser::TensorView*> > const&) + 0xf1 (0x7fb4bbc6a691 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #9: nvfuser::scheduleInnerOuterPersistentKernel(nvfuser::Fusion*, nvfuser::ReductionParams const&) + 0xab1 (0x7fb4bbc546f1 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #10: nvfuser::InnerOuterPersistentKernelScheduler::schedule(nvfuser::Fusion*) + 0xc6 (0x7fb4bbc55ca6 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #11: nvfuser::FusionKernelRuntime::compileKernel(nvfuser::KernelArgumentHolder const&, nvfuser::SegmentedGroup*) + 0x118 (0x7fb4bba36e78 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #12: <unknown function> + 0x43e0a7 (0x7fb4bba370a7 in /usr/local/lib/python3.10/site-packages/torch/lib/libnvfuser_codegen.so)
frame #13: c10::ThreadPool::main_loop(unsigned long) + 0x2b3 (0x7fb54878add3 in /usr/local/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #14: <unknown function> + 0xdc253 (0x7fb5482b0253 in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #15: <unknown function> + 0x94b43 (0x7fb549307b43 in /usr/lib/x86_64-linux-gnu/libc.so.6)
frame #16: <unknown function> + 0x126a00 (0x7fb549399a00 in /usr/lib/x86_64-linux-gnu/libc.so.6)
liqiangxl commented 1 year ago

In this case, we have an inner reduction of [I, I, R] and an outer reduction of [R, I, I]. Axis 1 is an iteration domain in the inner-reduction tv but not a reduction domain in the outer-reduction tv, so we can't do the outer reduction while looping over the iteration domains of the inner reduction. The current scheduler only supports cases where every iteration domain in the inner-reduction tv is a reduction domain in the outer-reduction tv, e.g. inner reduction [I, I, R] paired with outer reduction [R, R, I]. As a temporary fix, the innerOuterPersistent scheduler is disabled when the domain map fails; support for the pattern in this case will be added in the future.
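
For reference, here is a minimal sketch of the two pairings described above, written with the same nvFuser Python API as the repro. This is a hypothetical illustration (the function name and shapes are made up), not a repro of the assert, and which pairings the innerOuterPersistent scheduler accepts may vary by version:

from nvfuser import FusionDefinition, DataType

def reduction_pattern_sketch(fd: FusionDefinition) -> None:
    # Hypothetical sketch -- not the original repro.
    # Single [I, I, I] input tensor.
    T0 = fd.define_tensor(shape=[-1, -1, -1], contiguity=[True, True, True], dtype=DataType.Float, is_cpu=False)
    # Inner reduction: [I, I, R] (reduce the innermost axis).
    T1 = fd.ops.sum(T0, axes=[2], keepdim=False, dtype=DataType.Null)
    # Supported outer pairing: [R, R, I] -- every iteration domain of the
    # inner reduction is also a reduction domain here.
    T2 = fd.ops.sum(T0, axes=[0, 1], keepdim=False, dtype=DataType.Null)
    # Unsupported pairing (this issue): [R, I, I] -- axis 1 stays an
    # iteration domain in the outer reduction.
    T3 = fd.ops.sum(T0, axes=[0], keepdim=False, dtype=DataType.Null)
    fd.add_output(T1)
    fd.add_output(T2)
    fd.add_output(T3)

In the repro above, T23 (sum over axis 2) and T79 (sum over axis 0) appear to form exactly this unsupported [I, I, R] / [R, I, I] pairing.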

csarofeen commented 1 year ago

@liqiangxl which PR fixed this issue?

liqiangxl commented 1 year ago

> @liqiangxl which PR fixed this issue?

https://github.com/NVIDIA/Fuser/pull/1024