
Inference result different between cuda and cpu #21220

Open · CapJunkrat opened this issue 3 weeks ago

CapJunkrat commented 3 weeks ago

Describe the issue

I tried to use CPUExecutionProvider and CUDAExecutionProvider to run inference on the same single Conv node, and it turns out the results do not match beyond 4 decimal places. I'm wondering if this is expected and normal.

To reproduce

To generate this model:

import numpy as np
import onnx
from onnx import helper, numpy_helper

# Define convolution parameters
input_name = "input"  # Assuming the input tensor name
output_name = "output"  # Assuming the output tensor name
input_shape = (
    1,
    256,
    14,
    14,
)  # Example input shape: (batch_size, channels, height, width)

# Convolution node attributes
conv_attributes = {
    "dilations": [1, 1],
    "group": 1,
    "kernel_shape": [3, 3],
    "pads": [1, 1, 1, 1],
    "strides": [1, 1],
}
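
# Note: with kernel_shape [3, 3], strides [1, 1], and pads [1, 1, 1, 1],
# the output spatial size is (14 + 1 + 1 - 3) / 1 + 1 = 14, so the output
# shape equals the input shape (which is why output_tensor reuses input_shape).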

# Create the input tensor
input_tensor = helper.make_tensor_value_info(
    input_name, onnx.TensorProto.FLOAT, input_shape
)

# Create the convolution output tensor
output_tensor = helper.make_tensor_value_info(
    output_name, onnx.TensorProto.FLOAT, input_shape
)

# Create Conv node
conv_node = helper.make_node(
    "Conv",
    inputs=[input_name, "weight", "bias"],
    outputs=[output_name],
    **conv_attributes,
)

# Initializer for weights
weight_shape = (
    256,
    256,
    3,
    3,
)
weight_data = np.ones(weight_shape).astype(np.float32) * (0.1)
weight_initializer = numpy_helper.from_array(weight_data, name="weight")

# Initializer for bias
bias_shape = (256,)
bias_data = np.ones(bias_shape).astype(np.float32) * (0.05)
bias_initializer = numpy_helper.from_array(bias_data, name="bias")

# Create the graph with the Conv node and initializers
graph_def = helper.make_graph(
    nodes=[conv_node],
    name="convolution_model",
    inputs=[input_tensor],
    outputs=[output_tensor],
    initializer=[weight_initializer, bias_initializer],
)

# Create the ONNX model
model_def = helper.make_model(graph_def, opset_imports=[helper.make_opsetid("", 11)])

# Save the model
onnx.save(model_def, "conv_model.onnx")
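
(Optional) The saved model can be sanity-checked before running inference; a minimal sketch using onnx.checker, not part of the original repro:

import onnx

# Optional validation step (illustrative, not in the original repro)
model = onnx.load("conv_model.onnx")
onnx.checker.check_model(model)
print(onnx.helper.printable_graph(model.graph))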

To run inference and compare:

import numpy as np

import onnxruntime as ort

def assert_almost_equal(dict_or_list1, dict_or_list2, decimal=5, compare_model=""):
    assert len(dict_or_list1) == len(dict_or_list2)
    err_msg = "The consistency check between " + compare_model + " failed!"
    if isinstance(dict_or_list1, dict):
        for name in dict_or_list1.keys():
            np.testing.assert_almost_equal(
                dict_or_list1[name], dict_or_list2[name], decimal, err_msg
            )
    elif isinstance(dict_or_list1, list):
        for i in range(len(dict_or_list1)):
            np.testing.assert_almost_equal(
                dict_or_list1[i], dict_or_list2[i], decimal, err_msg
            )
    else:
        raise ValueError(
            f"unspport parameter type received in assert_almost_equal, dict or list expected but {type(dict_or_list1)} received"
        )

# Load ONNX model (since ORT 1.9, the providers must be listed explicitly)
onnx_model_path = "conv_model.onnx"
ort_session = ort.InferenceSession(
    onnx_model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

# Load inference data
inference_data = np.ones((1, 256, 14, 14)).astype(np.float32)

# Perform inference
input_name = ort_session.get_inputs()[0].name
output_name = ort_session.get_outputs()[0].name

ort_session.set_providers(["CUDAExecutionProvider"])
cuda_result = ort_session.run([output_name], {input_name: inference_data})

ort_session.set_providers(["CPUExecutionProvider"])
cpu_result = ort_session.run([output_name], {input_name: inference_data})

assert_almost_equal(cpu_result, cuda_result)
print("single op test pass")

Urgency

No response

Platform

Linux

OS Version

CentOS 7 & Ubuntu 22.04

ONNX Runtime Installation

Released Package

ONNX Runtime Version or Commit ID

1.16.3

ONNX Runtime API

Python

Architecture

X64

Execution Provider

Default CPU, CUDA

Execution Provider Library Version

CUDA 11.8

xadupre commented 3 weeks ago

Results can't be identical because of parallelism: the same operations are performed on CUDA and on CPU, but not necessarily in the same order, and since floating-point addition is not associative, a different accumulation order gives slightly different results. The outputs should be much closer for very small tensors, where there is little or no parallelism, but the accumulation order can still differ slightly.
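
In other words, a small relative error is expected rather than bit-exact equality. With this repro, each interior output element is 256 * 9 * 0.1 + 0.05 ≈ 230.45, so a mismatch in the 4th decimal place is a relative error of roughly 4e-7, i.e. within a few float32 ULPs. A minimal sketch of a tolerance-based comparison using np.testing.assert_allclose (the rtol/atol values here are illustrative assumptions, not bounds documented by ONNX Runtime):

import numpy as np

def check_close(cpu_outputs, cuda_outputs, rtol=1e-4, atol=1e-5):
    # Compare output lists with a relative tolerance instead of a fixed
    # number of decimal places; rtol/atol are illustrative assumptions.
    assert len(cpu_outputs) == len(cuda_outputs)
    for cpu_out, cuda_out in zip(cpu_outputs, cuda_outputs):
        np.testing.assert_allclose(cpu_out, cuda_out, rtol=rtol, atol=atol)

# Usage with the results from the repro script above:
# check_close(cpu_result, cuda_result)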