testValidate uses half tolerances for fp32 accumulation.

This can lead to false negatives (a validation that passes even though the output is wrong) because the resulting tolerance threshold is overly relaxed.

diff --git a/tests/cpp/test_gpu_fused_reduction.cpp b/tests/cpp/test_gpu_fused_reduction.cpp
index e67875f4..b3923d64 100644
--- a/tests/cpp/test_gpu_fused_reduction.cpp
+++ b/tests/cpp/test_gpu_fused_reduction.cpp
@@ -2582,4 +2582,23 @@ TEST_F(NVFuserTest, FusionTensorRankLimit) {
       executor_cache.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__);
 }

+TEST_F(NVFuserTest, Tolerance) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  TensorView* in = makeContigTensor(1, DataType::Half);
+  TensorView* out = castOp(DataType::Float, in);
+  out = sum(out, {0});
+  out = castOp(DataType::Half, out);
+  fusion->addInput(in);
+  fusion->addOutput(out);
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  at::Tensor in_tensor = at::randn({2048}, options);
+
+  FusionExecutorCache fec(std::move(fusion));
+  at::Tensor out_tensor = fec.runFusionWithInputs({in_tensor})[0];
+  testValidate(fec.fusion(), {out_tensor}, {in_tensor}, __LINE__, __FILE__);
+}
+
 } // namespace nvfuser
diff --git a/tests/cpp/validator.cpp b/tests/cpp/validator.cpp
index 6296a0da..18d5d808 100644
--- a/tests/cpp/validator.cpp
+++ b/tests/cpp/validator.cpp
@@ -102,6 +102,8 @@ void testValidate(
         " vs ",
         fusion_output_tensor.sizes());

+    std::cerr << "dtype = " << out_tv->getDataType().value() << std::endl;
+    std::cerr << "reduction_size = " << reduction_size << std::endl;
     auto tolerance_values =
         getTolerance(out_tv->getDataType().value(), reduction_size, tolerances);

NVIDIA / Fuser

testValidate uses half tolerances for fp32 accumulation. #2904