Closed Priya2698 closed 4 months ago
Repro:
```python
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    T0 = fd.define_tensor(shape=[-1, -1, -1, -1, -1], contiguity=[True, True, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[4, 3, 2, 1, 0])
    T1 = fd.define_tensor(shape=[-1, -1, -1, -1], contiguity=[True, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    T2 = fd.ops.permute(T0, dims=[0, 1, 3, 2, 4])
    T3 = fd.ops.stride_order(T2, stride_order=[4, 3, 2, 1, 0])
    V4 = fd.ops.shape(T1)
    T5 = fd.ops.reshape(T3, new_shape=V4)
    T6 = fd.ops.sum(T5, dims=[1], keepdim=False, dtype=DataType.Null)
    T7 = fd.ops.sum(T1, dims=[1], keepdim=False, dtype=DataType.Null)
    T8 = fd.ops.add(T6, T7)
    fd.add_output(T8)

with FusionDefinition() as fd:
    nvfuser_fusion_id0(fd)

inputs = [
    torch.randn((104857600,), dtype=torch.bfloat16, device='cuda:0').as_strided((16, 32, 25, 128, 64), (6553600, 204800, 8192, 64, 1)),
    torch.randn((104857600,), dtype=torch.bfloat16, device='cuda:0').as_strided((16, 32, 128, 1600), (6553600, 204800, 1600, 1)),
]
fd.execute(inputs)
```

```
Traceback (most recent call last):
  File "/opt/pytorch/nvfuser/nvfuser/__init__.py", line 139, in execute
    result = self._execute(
RuntimeError: false INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/executor_utils.cpp":829, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. CUDA NVRTC compile error: __tmp_kernel_reduction_f0_c1_r0_g0.cu(10546): error: no operator "+" matches these operands
            operand types are: <unnamed>::__bfloat + <unnamed>::__bfloat
      + T11[(i9 + (4LL * i14))];
```
The error does not occur if the dtype is specified explicitly in `fd.ops.sum` (the repro passes `dtype=DataType.Null`).
@kevinstephano mentioned we need explicit upcasting at this time, so this behavior is expected.
Repro:
The error does not occur if the dtype is specified in `fd.ops.sum`.