CUDA NVRTC compile error in `fd.ops.add`

Repro:
import torch
from nvfuser import FusionDefinition, DataType

def nvfuser_fusion_id0(fd : FusionDefinition) -> None :
    """Define the fusion that triggers the reported NVRTC compile error.

    Two bfloat16 tensors are defined. The 5-D one is permuted and reshaped to
    the shape of the 4-D one, both are sum-reduced over dim 1 with
    ``dtype=DataType.Null``, and the two reduction results are added and
    registered as the fusion output.

    Args:
        fd: The ``FusionDefinition`` on which the tensors and ops are defined.
    """
    # T0: 5-D bfloat16 input, every dimension dynamic (-1) and marked contiguous.
    T0 = fd.define_tensor(shape=[-1, -1, -1, -1, -1], contiguity=[True, True, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[4, 3, 2, 1, 0])
    # T1: 4-D bfloat16 input, every dimension dynamic (-1) and marked contiguous.
    T1 = fd.define_tensor(shape=[-1, -1, -1, -1], contiguity=[True, True, True, True], dtype=DataType.BFloat16, is_cpu=False, stride_order=[3, 2, 1, 0])
    # Swap axes 2 and 3 of T0, then request stride order [4, 3, 2, 1, 0] for the result.
    T2 = fd.ops.permute(T0, dims=[0, 1, 3, 2, 4])
    T3 = fd.ops.stride_order(T2, stride_order=[4, 3, 2, 1, 0])
    # Reshape the permuted 5-D tensor to T1's (4-D) shape.
    V4 = fd.ops.shape(T1)
    T5 = fd.ops.reshape(T3, new_shape=V4)
    # Sum both tensors over dim 1 with the dtype left as DataType.Null.
    # Per the report, the error does not occur when a dtype is specified here.
    T6 = fd.ops.sum(T5, dims=[1], keepdim=False, dtype=DataType.Null)
    T7 = fd.ops.sum(T1, dims=[1], keepdim=False, dtype=DataType.Null)
    # The add named in the report title; the NVRTC error complains that no
    # operator "+" matches two `__bfloat` operands.
    T8 = fd.ops.add(T6, T7)
    fd.add_output(T8)

# Populate a FusionDefinition with the ops from nvfuser_fusion_id0.
with FusionDefinition() as fd:
    nvfuser_fusion_id0(fd)

# Two random bfloat16 tensors on cuda:0, each a strided view over a flat
# buffer of 104857600 elements. The strides given are the contiguous
# (row-major) strides for the shapes (16, 32, 25, 128, 64) and
# (16, 32, 128, 1600) respectively.
inputs = [
    torch.randn((104857600,), dtype=torch.bfloat16, device='cuda:0').as_strided((16, 32, 25, 128, 64), (6553600, 204800, 8192, 64, 1)),
    torch.randn((104857600,), dtype=torch.bfloat16, device='cuda:0').as_strided((16, 32, 128, 1600), (6553600, 204800, 1600, 1)),
]
# This call raises the RuntimeError (NVRTC compile error) shown in the traceback that follows.
fd.execute(inputs)

Traceback (most recent call last):
  File "/opt/pytorch/nvfuser/nvfuser/__init__.py", line 139, in execute
    result = self._execute(
RuntimeError: false INTERNAL ASSERT FAILED at "/opt/pytorch/nvfuser/csrc/executor_utils.cpp":829, please report a bug with repro script to NVFuser at https://github.com/NVIDIA/Fuser/issues. 

CUDA NVRTC compile error: __tmp_kernel_reduction_f0_c1_r0_g0.cu(10546): error: no operator "+" matches these operands
            operand types are: <unnamed>::__bfloat + <unnamed>::__bfloat
              + T11[(i9 + (4LL * i14))];
The error does not occur if an explicit dtype is passed to `fd.ops.sum` instead of `dtype=DataType.Null`.
NVIDIA / Fuser

CUDA NVRTC compile error in `fd.ops.add` #2387