Closed naoyam closed 1 week ago
https://github.com/NVIDIA/Fuser/issues/871#issuecomment-2462425423
Here is a slightly smaller repro. It contains only the failing segment, which is scheduled by the Reduction scheduler, and has been simplified as much as possible.
# CUDA devices:
# 0: NVIDIA H100 80GB HBM3
# torch version: 2.6.0a0+gitffb7a08
# cuda version: 12.6
# nvfuser version: 0.2.22+git6912435
import torch
from nvfuser import FusionDefinition, DataType
def nvfuser_fusion_id0(fd: FusionDefinition) -> None:
    """Populate ``fd`` with the failing segment of the fusion.

    Four float tensors are declared as inputs. ``T2`` and ``T3`` are padded
    with widths ``[1, 0]`` and ``[0, 1]`` respectively and added together;
    ``T1`` is broadcast along a new leading dimension and multiplied by
    ``T0``; the two results are added, permuted with ``dims=[1, 0, 2]``,
    reshaped to ``[32768, 56]`` and summed over dimension 0.

    The sum (``T21``), the reshaped tensor (``T20``) and the broadcast of
    ``T1`` (``T13``) are registered as fusion outputs, in that order.

    Args:
        fd: The fusion definition that receives the tensors and ops.
    """
    # Fusion inputs.
    T0 = fd.define_tensor(shape=[28, 32768, 2], contiguity=[True, False, True], dtype=DataType.Float, is_cpu=False)
    T1 = fd.define_tensor(shape=[32768, 2], contiguity=[True, True], dtype=DataType.Float, is_cpu=False)
    T2 = fd.define_tensor(shape=[28, 32768, 1], contiguity=[True, False, None], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])
    T3 = fd.define_tensor(shape=[28, 32768, 1], contiguity=[True, False, None], dtype=DataType.Float, is_cpu=False, stride_order=[2, 1, 0])

    # Pad T2 and T3 with mirrored widths, then combine them.
    T7 = fd.ops.pad(T2, [1, 0], None)
    T11 = fd.ops.pad(T3, [0, 1], None)
    T12 = fd.ops.add(T7, T11)

    # Broadcast T1 along a new leading dimension and scale T0 by it.
    T13 = fd.ops.broadcast(T1, is_broadcast_dim=[True, False, False])
    T14 = fd.ops.mul(T13, T0)
    T15 = fd.ops.add(T12, T14)

    # Swap the first two dimensions, flatten to 2-D, and reduce over dim 0.
    T16 = fd.ops.permute(T15, dims=[1, 0, 2])
    T20 = fd.ops.reshape(T16, new_shape=[32768, 56])
    T21 = fd.ops.sum(T20, dims=[0], keepdim=False, dtype=DataType.Float)

    fd.add_output(T21)
    fd.add_output(T20)
    fd.add_output(T13)
# Build the fusion definition from the function above.
with FusionDefinition() as fd:
    nvfuser_fusion_id0(fd)

# One tensor per fusion input, in the order T0, T1, T2, T3.
# NOTE(review): T2 and T3 are declared with a last extent of 1, but the last
# two tensors below have a last extent of 2 -- confirm this is intended for
# the repro before changing it.
inputs = [
    torch.testing.make_tensor((28, 32768, 2), dtype=torch.float32, device='cuda:0'),
    torch.testing.make_tensor((32768, 2), dtype=torch.float32, device='cuda:0'),
    torch.testing.make_tensor((28, 32768, 2), dtype=torch.float32, device='cuda:0'),
    torch.testing.make_tensor((28, 32768, 2), dtype=torch.float32, device='cuda:0'),
]

# Running the fusion is what reproduces the reported failure.
fd.execute(inputs)
Thank you for fixing the problem!
(Separated out from #871 as I believe it's a separate issue. See https://github.com/NVIDIA/Fuser/issues/871#issuecomment-2461562917)
I encountered this error while trying to run HF's Qwen 2 model with Thunder (https://github.com/Lightning-AI/lightning-thunder/pull/1406). This model is important to support soon. @jacobhinkle or @naoyam do you have an estimate of how much time it could take to fix this bug? Is there any change we could introduce to the fusion definition as a workaround?