NVIDIA / Fuser

A Fusion Code Generator for NVIDIA GPUs (commonly known as "nvFuser")

Error found in Mistral-Nemo and Qwen2's RoPE implementations #3299

Closed: kevinstephano closed this issue 1 week ago

kevinstephano commented 4 weeks ago

Error message:

Error from segmentation group 3:  INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/index_compute.cpp":1966, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. Couldn't find allocation mapping for T19_l___bfloat[ iblockIdx.x148{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( 8 * ( 128000 * ( ceilDiv(1024, 8) ) ) ) ), 8) ), 1) ), 128) )}, iUS147{1}, iV145{8}, ithreadIdx.x149{128} ] ca_pos( 2 ) dim: 2 id: iS76{1024}, loops:  iblockIdx.x94{( ceilDiv(( ceilDiv(( ceilDiv(( 1 * ( ( 8 * 4 ) * ( 128000 * 128 ) ) ), 8) ), 1) ), 128) )} iUS93{1} iV145{8} ithreadIdx.x149{128}
Exception raised from getNonGlobalConsumerStridedIndices at /opt/pytorch/nvfuser/csrc/index_compute.cpp:1966 (most recent call first):
frame #0: nvfuser::nvfCheckFail(char const*, char const*, unsigned int, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) + 0xd6 (0x7fcc46af568a in /opt/pytorch/nvfuser/nvfuser/_C.cpython-312-x86_64-linux-gnu.so)

Repro:

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[1, 128000, 1024], contiguity=[None, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[2, 1, 0])
    T1 = fd.define_tensor(shape=[64], contiguity=[True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[0])
    T2 = fd.define_tensor(shape=[1, 128000], contiguity=[None, True], dtype=DataType.Int, is_cpu=False, stride_order=[1, 0])
    T8 = fd.ops.reshape(T0, new_shape=[1, 128000, 8, 128])  # split the 1024 hidden dim into 8 heads of 128
    T9 = fd.ops.permute(T8, dims=[0, 2, 1, 3])  # (1, 8, 128000, 128)
    S10 = fd.define_scalar(1, dtype=DataType.Int)
    S11 = fd.define_scalar(64, dtype=DataType.Int)
    S12 = fd.define_scalar(1, dtype=DataType.Int)
    T14 = fd.ops.broadcast_in_dim(T1, shape=[S10, S11, S12], broadcast_dims=[1])
    T15 = fd.ops.cast(T14, dtype=DataType.Float)
    S16 = fd.define_scalar(1, dtype=DataType.Int)
    S17 = fd.define_scalar(64, dtype=DataType.Int)
    S18 = fd.define_scalar(1, dtype=DataType.Int)
    T20 = fd.ops.broadcast_in_dim(T15, shape=[S16, S17, S18], broadcast_dims=[0, 1, 2])
    S21 = fd.define_scalar(1, dtype=DataType.Int)
    S22 = fd.define_scalar(1, dtype=DataType.Int)
    S23 = fd.define_scalar(128000, dtype=DataType.Int)
    T25 = fd.ops.broadcast_in_dim(T2, shape=[S21, S22, S23], broadcast_dims=[0, 2])
    T26 = fd.ops.cast(T25, dtype=DataType.Float)
    S27 = fd.define_scalar(1, dtype=DataType.Int)
    S28 = fd.define_scalar(8, dtype=DataType.Int)
    S29 = fd.define_scalar(1, dtype=DataType.Int)
    S30 = fd.define_scalar(128000, dtype=DataType.Int)
    S31 = fd.define_scalar(128, dtype=DataType.Int)
    T33 = fd.ops.broadcast_in_dim(T9, shape=[S27, S28, S29, S30, S31], broadcast_dims=[0, 1, 3, 4])  # insert a size-1 broadcast dim at position 2
    S34 = fd.define_scalar(1, dtype=DataType.Int)
    S35 = fd.define_scalar(8, dtype=DataType.Int)
    S36 = fd.define_scalar(4, dtype=DataType.Int)
    S37 = fd.define_scalar(128000, dtype=DataType.Int)
    S38 = fd.define_scalar(128, dtype=DataType.Int)
    T40 = fd.ops.broadcast_in_dim(T33, shape=[S34, S35, S36, S37, S38], broadcast_dims=[0, 1, 2, 3, 4])  # expand the broadcast dim to 4
    T46 = fd.ops.reshape(T40, new_shape=[1, 32, 128000, 128])  # merge the 8 and 4 dims into 32
    fd.add_output(T20)
    fd.add_output(T26)
    fd.add_output(T46)

with FusionDefinition() as fd:
    nvfuser_fusion_id0(fd)

inputs = [
    torch.testing.make_tensor((1, 128000, 1024), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((64,), dtype=torch.bfloat16, device='cuda:0'),
    torch.testing.make_tensor((1, 128000), dtype=torch.int64, device='cuda:0'),
]
fd.execute(inputs)
jacobhinkle commented 3 weeks ago

Slightly smaller repro:

import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[128000, 1024], contiguity=[True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[1, 0])
    T5 = fd.ops.reshape(T0, new_shape=[128000, 8, 128])  # split the 1024 hidden dim into 8 heads of 128
    T6 = fd.ops.permute(T5, dims=[1, 0, 2])  # (8, 128000, 128)
    S7 = fd.define_scalar(8, dtype=DataType.Int)
    S8 = fd.define_scalar(4, dtype=DataType.Int)
    S9 = fd.define_scalar(128000, dtype=DataType.Int)
    S10 = fd.define_scalar(128, dtype=DataType.Int)
    T12 = fd.ops.broadcast_in_dim(T6, shape=[S7, S8, S9, S10], broadcast_dims=[0, 2, 3])  # insert a broadcast dim at position 1 and expand it to 4
    T17 = fd.ops.reshape(T12, new_shape=[32, 128000, 128])  # merge the expanded dim with the size-8 dim
    fd.add_output(T17)

with FusionDefinition() as fd:
    nvfuser_fusion_id0(fd)

inputs = [
    torch.testing.make_tensor((128000, 1024), dtype=torch.bfloat16, device='cuda:0'),
]
fd.execute(inputs)

T12 has a broadcast dim that is expanded to size 4. We then reshape from (8, 4, 128000, 128) to (32, 128000, 128), which simply merges that expanded dim with the size-8 dim.
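For intuition, here is the same pattern in plain PyTorch (a sketch of what the traced model code likely does, not the failing nvFuser path; the variable name is made up). It looks like the repeat-KV step of grouped-query attention, which is where the Mistral-Nemo and Qwen2 RoPE traces would come from:

import torch

# Sketch: an expanded broadcast dim that a later reshape merges with a real
# dim. Shapes match the smaller repro above.
x = torch.randn(128000, 1024, dtype=torch.bfloat16, device="cuda")
x = x.reshape(128000, 8, 128).permute(1, 0, 2)  # (8, 128000, 128): 8 KV heads
x = x.unsqueeze(1).expand(8, 4, 128000, 128)    # stride-0 expanded broadcast dim
x = x.reshape(32, 128000, 128)                  # merges the expanded dim with the size-8 dim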

kevinstephano commented 3 weeks ago

Can we selectively enable the IdModel for this case?
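For reference, a minimal sketch of what that could look like from Python: nvFuser options can be toggled through the NVFUSER_ENABLE environment variable, and assuming an id_model enable option exists and covers this indexing path (the option name and its granularity are assumptions, not confirmed for this case), a run of the repro might be:

import os

# Assumption: "id_model" is the NVFUSER_ENABLE option for the IdModel-based
# indexer, and it is read when nvfuser is imported/first used.
os.environ["NVFUSER_ENABLE"] = "id_model"

from nvfuser import FusionDefinition, DataType  # then run either repro above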

kevinstephano commented 3 weeks ago

Any progress on this issue?

naoyam commented 3 weeks ago

The first attempt (#3317) wasn't successful. Will try a different workaround (WAR).