Open xiaowuhu opened 1 year ago
The mismatch comes from `fp16_tensor * fp32_constant`. In the ONNX graph, this is computed as `fp16_tensor * Cast(fp32_constant, to=float16)`. In NumPy, the computation is done in fp32 and the output is then cast to fp16.
The same happens when we provide the constant as float16 @tianleiwu:
import torch
import onnxruntime as ort
import numpy as np
# Two 5x5 float16 operands; the dict keys are the names fed to the session.
lhs = np.array(
    [
        [7.99, -3.93, -8.35, -5.188, -8.1],
        [3.7, 6.18, -2.293, -2.523, -1.925],
        [2.68, -8.15, 7.46, -1.995, 2.936],
        [-1.459, -5.188, -5.08, 8.73, 2.7],
        [-6.82, -7.55, 4.22, -0.3604, 2.936],
    ],
    dtype=np.float16,
)
rhs = np.array(
    [
        [-0.04395, -4.246, -2.338, 0.923, 4.938],
        [-8.3, -7.84, -2.004, -1.099, -7.797],
        [-7.39, 3.516, 2.89, -2.11, 4.457],
        [7.48, -0.3604, -8.41, -4.21, 6.793],
        [-8.55, 3.945, -7.207, -7.902, 6.555],
    ],
    dtype=np.float16,
)
feeds = {"input_0": lhs, "input_1": rhs}

# Run the onnxscript model stored in add.onnx; no execution provider is
# requested explicitly here.
session = ort.InferenceSession("add.onnx")
ort_result = session.run(None, feeds)
np.set_printoptions(formatter={"float": "{: 0.6f}".format})
print(ort_result)
print("------------")

# Reference 1: torch.ops.aten.add, i.e. output = self + other * alpha.
torch_result = torch.ops.aten.add(
    torch.tensor(lhs), torch.tensor(rhs), alpha=-3.125
).numpy()
print(torch_result)
print("------------")

# Reference 2: the same expression in NumPy with alpha as a float16 scalar.
numpy_result = lhs + rhs * np.array(-3.125, dtype=np.float16)
print(numpy_result)
print("------------")

# Pairwise closeness, in order: ort/torch, ort/numpy, torch/numpy.
for left, right in (
    (ort_result, torch_result),
    (ort_result, numpy_result),
    (torch_result, numpy_result),
):
    print(np.allclose(left, right, rtol=1e-03, atol=1e-05))
@justinchuby, you can see that `sess = ort.InferenceSession(model_path, providers=['CUDAExecutionProvider'])` works fine:
[array([[ 8.125000, 9.335938, -1.046875, -8.070312, -23.531250],
[ 29.625000, 30.687500, 3.968750, 0.910156, 22.437500],
[ 25.781250, -19.125000, -1.570312, 4.597656, -10.992188],
[-24.828125, -4.062500, 21.187500, 21.875000, -18.531250],
[ 19.875000, -19.875000, 26.734375, 24.328125, -17.546875]],
dtype=float16)]
------------
[[ 8.125000 9.335938 -1.046875 -8.070312 -23.531250]
[ 29.625000 30.687500 3.968750 0.910156 22.437500]
[ 25.781250 -19.125000 -1.570312 4.597656 -10.992188]
[-24.828125 -4.062500 21.187500 21.875000 -18.531250]
[ 19.875000 -19.875000 26.734375 24.328125 -17.546875]]
------------
[[ 8.125000 9.335938 -1.046875 -8.070312 -23.531250]
[ 29.625000 30.687500 3.968750 0.910156 22.437500]
[ 25.781250 -19.125000 -1.570312 4.597656 -10.992188]
[-24.828125 -4.062500 21.187500 21.875000 -18.531250]
[ 19.875000 -19.875000 26.734375 24.328125 -17.546875]]
------------
True
True
True
If you use CPU EP:
[array([[ 8.125000, 9.335938, -1.045898, -8.070312, -23.531250],
[ 29.625000, 30.671875, 3.968750, 0.909668, 22.437500],
[ 25.781250, -19.140625, -1.572266, 4.597656, -10.992188],
[-24.828125, -4.062500, 21.187500, 21.890625, -18.531250],
[ 19.890625, -19.875000, 26.734375, 24.328125, -17.546875]],
dtype=float16)]
------------
[[ 8.125000 9.335938 -1.046875 -8.070312 -23.531250]
[ 29.625000 30.687500 3.968750 0.910156 22.437500]
[ 25.781250 -19.125000 -1.570312 4.597656 -10.992188]
[-24.828125 -4.062500 21.187500 21.875000 -18.531250]
[ 19.875000 -19.875000 26.734375 24.328125 -17.546875]]
------------
[[ 8.125000 9.335938 -1.046875 -8.070312 -23.531250]
[ 29.625000 30.687500 3.968750 0.910156 22.437500]
[ 25.781250 -19.125000 -1.570312 4.597656 -10.992188]
[-24.828125 -4.062500 21.187500 21.875000 -18.531250]
[ 19.875000 -19.875000 26.734375 24.328125 -17.546875]]
------------
False
False
True
The reason is that the CPU EP does not have FP16 versions of Mul and Add, so it inserts Cast nodes and computes Mul and Add in FP32. onnxscript evidently has not considered this case: alpha is used as an FP32 value in the CPU EP, but as an FP16 value in the CUDA EP.
I see. Thank you!
Posting a comparison script including gpu results
import torch
import onnxruntime as ort
import numpy as np
import onnxscript
import onnx
# Two 5x5 float16 operands; the dict keys are the names fed to the sessions.
feeds = {
    "input_0": np.array(
        [
            [7.99, -3.93, -8.35, -5.188, -8.1],
            [3.7, 6.18, -2.293, -2.523, -1.925],
            [2.68, -8.15, 7.46, -1.995, 2.936],
            [-1.459, -5.188, -5.08, 8.73, 2.7],
            [-6.82, -7.55, 4.22, -0.3604, 2.936],
        ],
        dtype=np.float16,
    ),
    "input_1": np.array(
        [
            [-0.04395, -4.246, -2.338, 0.923, 4.938],
            [-8.3, -7.84, -2.004, -1.099, -7.797],
            [-7.39, 3.516, 2.89, -2.11, 4.457],
            [7.48, -0.3604, -8.41, -4.21, 6.793],
            [-8.55, 3.945, -7.207, -7.902, 6.555],
        ],
        dtype=np.float16,
    ),
}
# Scale factor shared by every reference computation below.
alpha = -3.125

# use onnxscript model
model_path = "add.onnx"
print(onnxscript.proto2text(onnx.load(model_path)))

# Run the same model once per execution provider.
sess = ort.InferenceSession(model_path, providers=["CUDAExecutionProvider"])
output_ort_cuda = sess.run(None, feeds)
sess = ort.InferenceSession(model_path, providers=["CPUExecutionProvider"])
output_ort_cpu = sess.run(None, feeds)
np.set_printoptions(formatter={"float": "{: 0.6f}".format})
print("ort_cpu")
print(output_ort_cpu)
print("ort_cuda")
print(output_ort_cuda)
print("------------")

# use torch.ops.aten.add
# output = self + other * alpha
# The tensors are built once and reused for both devices.
self_pt = torch.tensor(feeds["input_0"])
other_pt = torch.tensor(feeds["input_1"])
output_pt_cpu = torch.ops.aten.add(self_pt, other_pt, alpha=alpha).cpu().numpy()
print("pt_cpu")
print(output_pt_cpu)
cuda = torch.device("cuda")
output_pt_cuda = (
    torch.ops.aten.add(self_pt.to(cuda), other_pt.to(cuda), alpha=alpha)
    .cpu()
    .numpy()
)
print("pt_cuda")
print(output_pt_cuda)
print("------------")

# use numpy (alpha as a float16 scalar)
output_np = feeds["input_0"] + feeds["input_1"] * np.array(alpha, dtype=np.float16)
print("np")
print(output_np)
print("------------")
def comp(a, b, a_name, b_name, rtol=1e-03, atol=1e-05):
    """Print whether ``a`` and ``b`` are element-wise close.

    The printed line has the form ``"<a_name> vs <b_name>, <True|False>"``.

    Args:
        a: First array-like operand passed to ``np.allclose``.
        b: Second array-like operand passed to ``np.allclose``.
        a_name: Label printed for ``a``.
        b_name: Label printed for ``b``.
        rtol: Relative tolerance forwarded to ``np.allclose``.
        atol: Absolute tolerance forwarded to ``np.allclose``.
    """
    print(f"{a_name} vs {b_name}, {np.allclose(a, b, rtol=rtol, atol=atol)}")
# Every pairing of the five results, as (left, right, left label, right label).
comparisons = [
    (output_ort_cpu, output_ort_cuda, "ort_cpu", "ort_cuda"),
    (output_pt_cpu, output_pt_cuda, "pt_cpu", "pt_cuda"),
    (output_ort_cpu, output_pt_cuda, "ort_cpu", "pt_cuda"),
    (output_ort_cuda, output_pt_cpu, "ort_cuda", "pt_cpu"),
    (output_ort_cpu, output_np, "ort_cpu", "np"),
    (output_ort_cuda, output_np, "ort_cuda", "np"),
    (output_pt_cpu, output_np, "pt_cpu", "np"),
    (output_pt_cuda, output_np, "pt_cuda", "np"),
]
for left, right, left_label, right_label in comparisons:
    comp(left, right, left_label, right_label)
Results
...
------------
ort_cpu vs ort_cuda, False
pt_cpu vs pt_cuda, False
ort_cpu vs pt_cuda, True
ort_cuda vs pt_cpu, True
ort_cpu vs np, False
ort_cuda vs np, True
pt_cpu vs np, True
pt_cuda vs np, False
It is interesting that ort_cuda == pt_cpu == np, ort_cpu == pt_cuda, and ort_cpu != np.
@tianleiwu do you have any insight? We are fairly sure that both pt_cpu and np compute fp16 arithmetic in fp32 and cast the result back. Are you suggesting that ORT CPU is somehow able to compute in fp16 directly, while the CUDA EP applies the upcasting logic?
import torch
def func(a, b, alpha):
    """Return ``a + b * alpha`` using the operands' own ``*`` and ``+``."""
    return a + b * alpha
def native_add(a, b, alpha):
    """Return ``torch.ops.aten.add(a, b, alpha=alpha)``.

    Unlike ``func``, the scaling by ``alpha`` happens inside the aten op
    rather than as a separate multiplication.
    """
    return torch.ops.aten.add(a, b, alpha=alpha)
def call(a, b, alpha, device, fn):
    """Move ``a`` and ``b`` to ``device`` and return ``fn(a, b, alpha)``.

    Args:
        a: First operand; must provide ``.to(device)``.
        b: Second operand; must provide ``.to(device)``.
        alpha: Scale factor forwarded to ``fn`` unchanged.
        device: Target passed to each operand's ``.to``.
        fn: Callable invoked as ``fn(a, b, alpha)`` on the moved operands.
    """
    a = a.to(device)
    b = b.to(device)
    return fn(a, b, alpha)
# One float16 element pair, scaled by alpha = -3.125.
self_t = torch.tensor([-8.35], dtype=torch.float16)
other_t = torch.tensor([-2.338], dtype=torch.float16)
alpha = -3.125

# For each device, print the explicit expression first, then the aten op.
for device_name in ("cpu", "cuda"):
    for fn in (func, native_add):
        print(call(self_t, other_t, alpha, torch.device(device_name), fn))
Result
tensor([-1.0469], dtype=torch.float16)
tensor([-1.0469], dtype=torch.float16)
tensor([-1.0469], device='cuda:0', dtype=torch.float16)
tensor([-1.0459], device='cuda:0', dtype=torch.float16)
On a side note, @justinchuby, in this case torch is consistent between CPU and GPU with respect to op math type promotion. What differs is how alpha is handled internally inside the kernel.
Any update on this?
I think torch usually keeps the scalar in fp32, and internally the kernel uses fp32 as the accumulation type even though the input tensor is fp16. In ONNX, the scalar is sometimes defined with the same data type as the input, so there is precision loss in ONNX export when the fp32 scalar is converted to fp16.
In ORT, the CPU EP upcasts fp16 to fp32 for computation even though the input tensor is fp16. The CUDA EP computes in fp16 when possible, so there is a difference.
I think those are by design.
Describe the issue
The attached add.zip contains the add.onnx model. This issue impacts inference of torch-to-ONNX exported models at fp16 precision. When testing with fp16 input, one sample (at location [2,2]) failed because the output is not close to the torch and NumPy results. By the way, fp32 is OK. The model was created by onnxscript, so you can see a function inside it.
To reproduce
add.zip
Urgency
No response
Platform
Windows
OS Version
11
ONNX Runtime Installation
Released Package
ONNX Runtime Version or Commit ID
1.14
ONNX Runtime API
Python
Architecture
X64
Execution Provider
Default CPU
Execution Provider Library Version
No response