NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")

Segfault when gathering broadcasted inputs #93

Open jacobhinkle opened 1 year ago

jacobhinkle commented 1 year ago

The following code results in a segfault as of yesterday (e.g. at commit 1a5db862df21e5dabaeb0f3648a012ea60cee8c3):

import torch
import torch.nn.functional as F
import nvfuser

def test_embedding(
        vocab_size=4096,
        embedding_dim=512,
        sentence_len=1024,
        batch_size=128,
    ):
    terms = torch.randint(0, vocab_size, [batch_size], dtype=torch.long, device='cuda')
    embedding_table = torch.randn([vocab_size, embedding_dim], dtype=torch.float32, device='cuda')

    torch_embedded = F.embedding(terms, embedding_table)  # successful

    with nvfuser.FusionDefinition() as fd:
        t = fd.from_pytorch(terms)
        e = fd.from_pytorch(embedding_table)

        # look up each term's embedding and output the embedded terms
        tb = fd.ops.broadcast(t, [False, True, True])  # bcast in term and embedding dims
        eb = fd.ops.broadcast(e, [True, False, False])  # bcast in batch dimension
        embedded = fd.ops.gather(eb, tb, dim=1)

        fd.add_output(embedded)

    nvf_out, _ = fd.execute([terms, embedding_table])
    embedded = nvf_out[0].squeeze()

test_embedding()
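For reference, the broadcast-then-gather pattern above is intended to reproduce the embedding lookup below; this is an equivalent eager-mode PyTorch sketch for illustration only (not part of the original repro):

import torch
import torch.nn.functional as F

batch_size, vocab_size, embedding_dim = 128, 4096, 512
terms = torch.randint(0, vocab_size, [batch_size], dtype=torch.long, device='cuda')
embedding_table = torch.randn([vocab_size, embedding_dim], dtype=torch.float32, device='cuda')

# Expand the indices to [batch, 1, embed] and the table to [batch, vocab, embed],
# gather along the vocab dimension, then drop it to get [batch, embed].
idx = terms[:, None, None].expand(-1, 1, embedding_dim)
src = embedding_table.unsqueeze(0).expand(batch_size, -1, -1)
ref = torch.gather(src, 1, idx).squeeze(1)
assert torch.equal(ref, F.embedding(terms, embedding_table))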

A partial backtrace is here:

#0  0x00007fff6c003818 in std::vector<nvfuser::Val*, std::allocator<nvfuser::Val*> >::size (this=0x48)                                   
    at /usr/include/c++/9/bits/stl_vector.h:916                                                                                          
#1  0x00007fff6c595e76 in nvfuser::ComputeAtRootDomainMapBuilder::initializeBcastMap (this=0x7fffffffb110, tv=0x55555c0e6890,            
    id=0x55555c168100) at /opt/pytorch/nvfuser/csrc/root_domain_map.cpp:792                                                
#2  0x00007fff6c598eb0 in nvfuser::ComputeAtRootDomainMapBuilder::handle (this=0x7fffffffb110, tv=0x55555c0e6890)
    at /opt/pytorch/nvfuser/csrc/root_domain_map.cpp:1195                                                                                
#3  0x00007fff6c079a9c in nvfuser::Val::dispatch<nvfuser::OptOutDispatch*> (handler=0x7fffffffb110, val=0x55555c0e6890)
    at /opt/pytorch/nvfuser/csrc/dispatch.cpp:92            
#4  0x00007fff6c0768a1 in nvfuser::OptOutDispatch::handle (this=0x7fffffffb110, v=0x55555c0e6890)                                  
    at /opt/pytorch/nvfuser/csrc/dispatch.cpp:762           
#5  0x00007fff6c331747 in nvfuser::BackwardVisitor::handle (this=0x7fffffffb110, val=0x55555c0e6890)                         
    at /opt/pytorch/nvfuser/csrc/iter_visitor.cpp:396
#6  0x00007fff6c079165 in nvfuser::Statement::dispatch<nvfuser::OptOutDispatch*> (handler=0x7fffffffb110, stmt=0x55555c0e6890)
    at /opt/pytorch/nvfuser/csrc/dispatch.cpp:330    
#7  0x00007fff6c07684d in nvfuser::OptOutDispatch::handle (this=0x7fffffffb110, s=0x55555c0e6890)            
    at /opt/pytorch/nvfuser/csrc/dispatch.cpp:754    
#8  0x00007fff6c3316f3 in nvfuser::BackwardVisitor::handle (this=0x7fffffffb110, stmt=0x55555c0e6890)           
    at /opt/pytorch/nvfuser/csrc/iter_visitor.cpp:388                                                                                    
#9  0x00007fff6c331d79 in nvfuser::BackwardVisitor::traverseTo (this=0x7fffffffb110, fusion=0x55555a10de90,
    from=std::vector of length 1, capacity 1 = {...}, traverseAllPaths=false) at /opt/pytorch/nvfuser/csrc/iter_visitor.cpp:462
#10 0x00007fff6c595930 in nvfuser::ComputeAtRootDomainMapBuilder::ComputeAtRootDomainMapBuilder (this=0x7fffffffb110, root_map=..., 
    map_through_reduction=false) at /opt/pytorch/nvfuser/csrc/root_domain_map.cpp:758
#11 0x00007fff6c593b13 in nvfuser::ComputeAtRootDomainMap::build (this=0x7fffffffb3f0, map_through_reduction=false)
    at /opt/pytorch/nvfuser/csrc/root_domain_map.cpp:483         
#12 0x00007fff6bfe3d0e in nvfuser::MaxPosCalculator::buildUnmappableDims (this=0x7fffffffb650, compute_at_only=false)
    at /opt/pytorch/nvfuser/csrc/inlining.cpp:31
#13 0x00007fff6bfe3c79 in nvfuser::MaxPosCalculator::MaxPosCalculator (this=0x7fffffffb650, 
    uninlinable_ids=std::unordered_set with 0 elements, compute_at_only=false) at /opt/pytorch/nvfuser/csrc/inlining.cpp:21
#14 0x00007fff6bfe55d5 in nvfuser::inlineAllAt (reference_tv=0x55555c0fe5a0, reference_pos=2, best_effort=true, 
    uninlinable_ids=std::unordered_set with 0 elements) at /opt/pytorch/nvfuser/csrc/inlining.cpp:290
#15 0x00007fff6c5b8c54 in nvfuser::schedulePointwise (fusion=0x55555a10de90, params=...)
    at /opt/pytorch/nvfuser/csrc/scheduler/pointwise.cpp:789
#16 0x00007fff6c60757d in nvfuser::(anonymous namespace)::PointWiseScheduler::schedule (this=0x55555bdb4860, fusion=0x55555a10de90)
    at /opt/pytorch/nvfuser/csrc/scheduler/registry.cpp:1598
#17 0x00007fff6c357abe in nvfuser::FusionKernelRuntime::runKernelWithInput (this=0x55555aacc470, args=..., sg=0x55555a9b2800)
    at /opt/pytorch/nvfuser/csrc/kernel_cache.cpp:365
#18 0x00007fff6c359f55 in nvfuser::FusionKernelRuntime::runWithInput (this=0x55555aacc470, args=...)
    at /opt/pytorch/nvfuser/csrc/kernel_cache.cpp:659

The problem comes from dereferencing tv->definition() without checking for nullptr at https://github.com/NVIDIA/Fuser/blob/main/csrc/root_domain_map.cpp#L794. Changing this line to

 (tv->definition() && tv->definition()->outputs().size() > 1) ||

raises an informative (but uncaught) exception, and the Python script exits with a RuntimeError.

naoyam commented 1 year ago

Thanks for reporting. Was the repro working before?

jacobhinkle commented 1 year ago

No, I haven't gotten it working on any commit. I think the code in question was introduced in October: https://github.com/csarofeen/pytorch/pull/2072/files#diff-147f701ca808989bf9ad700751c261c21fc30cea6c0f434001cdc4c2ce0be042R767

naoyam commented 1 year ago

Actually, that wasn't the commit that introduced it; the same code was already there. Looks like I added it here: https://github.com/csarofeen/pytorch/commit/6d14059cd44247de6af8705e8ba843b65fe638e6

Will look into it.

jjsjann123 commented 1 year ago

Lol, also ran into this with cross_entropy, where we have an indices target with weight and mean reduction.

This pattern happens when we take_along_axis into weight to compute the divisor for the mean.
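For illustration, here is a rough eager-mode PyTorch sketch of that pattern (hypothetical shapes; the point is that the mean's divisor is a gather of weight by the target indices):

import torch
import torch.nn.functional as F

num_classes, batch = 10, 32
logits = torch.randn(batch, num_classes)
target = torch.randint(0, num_classes, (batch,))
weight = torch.rand(num_classes)

# Weighted per-sample losses; the 'mean' reduction divides by the sum of the
# gathered per-sample weights (a take_along_axis-style lookup into weight).
per_sample = F.cross_entropy(logits, target, weight=weight, reduction='none')
gathered_w = weight[target]                      # gather weight along dim 0 by target
loss = per_sample.sum() / gathered_w.sum()

ref = F.cross_entropy(logits, target, weight=weight, reduction='mean')
assert torch.allclose(loss, ref)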

naoyam commented 1 year ago

@jacobhinkle Is this fixed now that #192 is merged?

jacobhinkle commented 1 year ago

Getting the same segfault. If I guard the nullptr dereference at root_domain_map.cpp:753, then that TORCH_INTERNAL_ASSERT still fails.

naoyam commented 1 year ago

(Note to self) #206 has a disabled test. Make sure to check that test as well.

jacobhinkle commented 1 year ago

The following C++ version uses take_along_axis to compute the original example, and it works properly following the merge of #240:

  auto fusion = std::make_unique<Fusion>();
  FusionGuard fg(fusion.get());

  int64_t vocab_size = 300;
  int64_t embedding_dim = 96;
  int64_t sentence_len = 256;
  int64_t batch_size = 20;

  // terms: [batch, sentence] int64 indices; embedding_table: [vocab, embed] float
  auto tv0 = makeSymbolicTensor(2, DataType::Int);
  auto tv1 = makeSymbolicTensor(2);
  fusion->addInput(tv0);
  fusion->addInput(tv1);

  // broadcast terms to [batch, sentence, 1, 1] and the table to [1, 1, vocab, embed],
  // then index along the vocab dimension and squeeze it away
  auto tv2 = broadcast(tv0, {false, false, true, true});
  auto tv3 = broadcast(tv1, {true, true, false, false});
  //auto tv4 = torch_gather(tv3, 1, tv2);
  auto tv4 = take_along_axis(tv3, tv2, 2);
  auto tv5 = squeeze(tv4, std::vector<bool>{false, false, true, false});
  fusion->addOutput(tv5);

  auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
  at::manual_seed(0);

  auto terms = at::randint(0, vocab_size, {batch_size, sentence_len}, options.dtype(at::kLong));
  auto embedding_table = at::randn({vocab_size, embedding_dim}, options);
  std::vector<c10::IValue> aten_inputs({terms, embedding_table});

  FusionExecutorCache fec(std::move(fusion));
  auto cg_outputs = fec.runFusionWithInputs(aten_inputs);

  auto ref = at::embedding(embedding_table, terms);

  TORCH_CHECK(ref.equal(cg_outputs[0]));

jacobhinkle commented 1 year ago

Note that in the above code, swapping in torch_gather instead of take_along_axis results in the following error:

C++ exception with description "root_ind != nullptr INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/index_compute.cpp":1983, please report a bug to PyTorch. Couldn't find root mapping for T3_g[ bS8{1}, bS9{1}, iS80{T3.size[2]}, iS81{T3.size[3]} ] dim: 2 id: iS80{T3.size[2]} Exception raised from getProducerRootIndices at /opt/pytorch/nvfuser/csrc/index_compute.cpp:1983

jacobhinkle commented 1 year ago

I have left this open for now, since the segfault is replaced by that failed assertion. However, we could also close it, as the workaround is simply to use take_along_axis unless we know we might need to shrink some of the non-indexed axes.
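For completeness, a minimal sketch of that workaround in the Python frontend, applied to the original repro; this assumes fd.ops.take_along_axis is exposed with the same call shape as fd.ops.gather (names and signatures may differ across versions):

import torch
import nvfuser

vocab_size, embedding_dim, batch_size = 4096, 512, 128
terms = torch.randint(0, vocab_size, [batch_size], dtype=torch.long, device='cuda')
embedding_table = torch.randn([vocab_size, embedding_dim], dtype=torch.float32, device='cuda')

with nvfuser.FusionDefinition() as fd:
    t = fd.from_pytorch(terms)
    e = fd.from_pytorch(embedding_table)
    tb = fd.ops.broadcast(t, [False, True, True])   # terms -> [batch, 1, 1]
    eb = fd.ops.broadcast(e, [True, False, False])  # table -> [1, vocab, embed]
    # take_along_axis instead of gather sidesteps the crashing path
    embedded = fd.ops.take_along_axis(eb, tb, dim=1)
    fd.add_output(embedded)

nvf_out, _ = fd.execute([terms, embedding_table])
embedded = nvf_out[0].squeeze()   # [batch, embed]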