
Inference result different between cuda and cpu #21220

Open · CapJunkrat opened this issue 3 weeks ago

CapJunkrat commented 3 weeks ago

Describe the issue

I tried to use CPUExecutionProvider and CUDAExecutionProvider to run inference on the same single Conv node, and it turns out the results do not match beyond 4 decimal places. I'm wondering if this is expected and normal.

To reproduce

To generate this model:

import numpy as np
import onnx
from onnx import helper, numpy_helper

# Define convolution parameters
input_name = "input"  # Assuming the input tensor name
output_name = "output"  # Assuming the output tensor name
input_shape = (
    1,
    256,
    14,
    14,
)  # Example input shape: (batch_size, channels, height, width)

# Convolution node attributes
conv_attributes = {
    "dilations": [1, 1],
    "group": 1,
    "kernel_shape": [3, 3],
    "pads": [1, 1, 1, 1],
    "strides": [1, 1],
}
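
# Note: with kernel_shape [3, 3], strides [1, 1], and pads [1, 1, 1, 1],
# the output spatial size is (14 + 1 + 1 - 3) / 1 + 1 = 14, so the output
# shape equals the input shape (which is why output_tensor reuses input_shape).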

# Create the input tensor
input_tensor = helper.make_tensor_value_info(
    input_name, onnx.TensorProto.FLOAT, input_shape
)

# Create the convolution output tensor
output_tensor = helper.make_tensor_value_info(
    output_name, onnx.TensorProto.FLOAT, input_shape
)

# Create Conv node
conv_node = helper.make_node(
    "Conv",
    inputs=[input_name, "weight", "bias"],
    outputs=[output_name],
    **conv_attributes,
)

# Initializer for weights
weight_shape = (
    256,
    256,
    3,
    3,
)
weight_data = np.ones(weight_shape).astype(np.float32) * (0.1)
weight_initializer = numpy_helper.from_array(weight_data, name="weight")

# Initializer for bias
bias_shape = (256,)
bias_data = np.ones(bias_shape).astype(np.float32) * (0.05)
bias_initializer = numpy_helper.from_array(bias_data, name="bias")

# Create the graph with the Conv node and initializers
graph_def = helper.make_graph(
    nodes=[conv_node],
    name="convolution_model",
    inputs=[input_tensor],
    outputs=[output_tensor],
    initializer=[weight_initializer, bias_initializer],
)

# Create the ONNX model
model_def = helper.make_model(graph_def, opset_imports=[helper.make_opsetid("", 11)])

# Save the model
onnx.save(model_def, "conv_model.onnx")
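
(Optional) The saved model can be sanity-checked before running inference; a minimal sketch using onnx.checker, not part of the original repro:

import onnx

# Optional validation step (illustrative, not in the original repro)
model = onnx.load("conv_model.onnx")
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))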

To run inference and compare:

import numpy as np

import onnxruntime as ort

def assert_almost_equal(dict_or_list1, dict_or_list2, decimal=5, compare_model=""):
    assert len(dict_or_list1) == len(dict_or_list2)
    err_msg = "The consistency check between " + compare_model + " failed!"
    if isinstance(dict_or_list1, dict):
        for name in dict_or_list1.keys():
            np.testing.assert_almost_equal(
                dict_or_list1[name], dict_or_list2[name], decimal, err_msg
            )
    elif isinstance(dict_or_list1, list):
        for i in range(len(dict_or_list1)):
            np.testing.assert_almost_equal(
                dict_or_list1[i], dict_or_list2[i], decimal, err_msg
            )
    else:
        raise ValueError(
            f"unspport parameter type received in assert_almost_equal, dict or list expected but {type(dict_or_list1)} received"
        )

# Load ONNX model (since ORT 1.9, the providers must be listed explicitly)
onnx_model_path = "conv_model.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

# Load inference data
inference_data = np.ones((1, 256, 14, 14)).astype(np.float32)

# Perform inference
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

ort_session.set_providers(["CUDAExecutionProvider"])
cuda_result = ort_session.run([output_name], {input_name: inference_data})

ort_session.set_providers(["CPUExecutionProvider"])
cpu_result = ort_session.run([output_name], {input_name: inference_data})

assert_almost_equal(cpu_result, cuda_result)
print("single op test pass")

Urgency

No response

Platform

Linux

OS Version

CentOS 7 & Ubuntu 22.04

ONNX Runtime Installation

Released Package

ONNX Runtime Version or Commit ID

1.16.3

ONNX Runtime API

Python

Architecture

X64

Execution Provider

Default CPU, CUDA

Execution Provider Library Version

CUDA 11.8

xadupre commented 3 weeks ago

Results can't be identical because of parallelism: the same operations are performed on CUDA and on CPU, but not necessarily in the same order, and since floating-point addition is not associative, a different accumulation order gives slightly different results. The outputs should be much closer for very small tensors, where there is little or no parallelism, but the accumulation order can still differ slightly.
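
In other words, a small relative error is expected rather than bit-exact equality. With this repro, each interior output element is 256 * 9 * 0.1 + 0.05 ≈ 230.45, so a mismatch in the 4th decimal place is a relative error of roughly 4e-7, i.e. within a few float32 ULPs. A minimal sketch of a tolerance-based comparison using np.testing.assert_allclose (the rtol/atol values here are illustrative assumptions, not bounds documented by ONNX Runtime):

import numpy as np

def check_close(cpu_outputs, cuda_outputs, rtol=1e-4, atol=1e-5):
    # Compare output lists with a relative tolerance instead of a fixed
    # number of decimal places; rtol/atol are illustrative assumptions.
    assert len(cpu_outputs) == len(cuda_outputs)
    for cpu_out, cuda_out in zip(cpu_outputs, cuda_outputs):
        np.testing.assert_allclose(cpu_out, cuda_out, rtol=rtol, atol=atol)

# Usage with the results from the repro script above:
# check_close(cpu_result, cuda_result)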