apache / tvm

Open deep learning compiler stack for cpu, gpu and specialized accelerators
https://tvm.apache.org/
Apache License 2.0

[Bug] inconsistent outputs produced by the same model #15907

Open ntcmp2u opened 1 year ago

ntcmp2u commented 1 year ago

I used TVM to optimize an ONNX model, but the output of the optimized model is inconsistent.

The PoC code is as follows:

import tvm
import onnx
from tvm import relay
import numpy as np
from numpy import testing
import pickle

model_input = np.array([
    [6.71610641,3.85466981,6.40658808,5.93880939,4.22816277,4.05935574,4.05884647,6.85629654,4.35045576,4.98825026,4.79773617,5.90505457,4.07920122,3.91599631,5.48346233,4.68261051,5.96164513,4.04757214,6.40736198,5.78212643,3.97077084,5.38297272,5.98323917,3.33484483,5.0777626],
    [5.78360128,3.50137734,6.22678471,6.58975887,5.86054516,6.28650713,4.02936316,6.55729437,4.83005142,5.48753119,6.6756916,3.1908493,5.902318,3.63500333,6.6633153,4.03696537,4.59259987,5.29771805,5.47530413,6.98861551,5.21775055,6.27956057,6.06063652,6.99890566,6.74760342],
    [5.15759087,5.79957485,3.87717772,3.98443675,4.08592176,3.68012309,3.56660628,6.5352087,6.32946777,5.21131229,4.83698702,3.24459314,5.40390682,3.10241055,6.76744366,4.29461193,5.75841236,5.95449638,3.74979687,4.88354635,6.29391479,5.12002468,4.30822229,5.12105751,3.2507937],
    [3.6196816,4.79808092,4.01847553,3.04140043,6.1275444,6.30466843,3.18326187,4.40191364,6.87912273,5.95551109,6.82588673,5.95711136,5.95687199,6.31480217,3.16748881,4.01974392,4.79580688,4.65139341,4.1572628,6.83029556,6.77638483,4.56482315,6.59405708,6.15137196,3.67091894],
    [6.05837917,6.34490871,6.82791471,6.16849041,3.10806918,5.42449474,5.17070627,3.94981074,4.28552246,4.36995983,6.1662178,4.59585524,3.32610178,6.27614117,4.7198143,4.82491684,5.11868238,5.589221,5.05218983,4.47221899,6.56419563,3.33831954,4.76141453,5.47758198,6.76508904],
    [5.27446032,4.80305195,6.86809921,5.71008205,4.43950748,4.55266809,6.0204134,4.15397835,5.40533447,4.21241283,5.2563076,5.31718588,5.03535795,5.61827183,5.727705,5.56590462,3.99580646,3.07781434,6.91844368,4.20229912,6.46835232,6.02725315,6.19255781,5.68293953,3.91823912],
    [5.54070759,5.55740356,4.43334723,5.79145813,4.09821415,4.99966526,6.07888222,5.35138416,5.53214931,5.38911247,6.39173222,3.72211218,4.38897705,6.03391933,5.74040985,5.49542809,6.62584782,4.33855057,6.8596487,6.41575336,6.35653496,3.90258241,6.14905071,4.76633644,5.97037888],
    [6.64982224,6.40295553,4.78704929,3.03489995,5.26092148,3.17754674,3.61741209,5.90558434,3.44067335,4.10587311,5.2109766,4.80613804,6.83341599,5.98525429,6.70596409,4.64003181,4.31005573,3.14922762,6.44953346,6.32083702,4.5173068,4.73698139,5.2820859,5.57041645,3.72178435],
    [6.80869818,3.77339244,3.81124735,5.25667477,6.97375488,5.42400169,3.45591283,4.84353495,3.8287816,5.41924429,3.18944407,4.95114517,3.18213964,3.20189476,5.98147392,4.49553967,6.49250174,5.02359581,5.37200642,4.57207108,5.9689579,4.29011774,6.34126377,4.39409637,5.17850113],
    [4.48860836,3.66642022,4.86899662,3.43860793,6.94423056,4.19769287,3.77650118,4.41444111,6.10652637,6.75893021,6.0855093,5.83897305,3.97421741,4.10853386,3.62891579,5.17287302,6.16482878,5.45052338,5.91892958,3.24301076,6.47304344,5.24120235,6.28845692,3.56531596,5.53275776],
    [3.08167267,5.02443886,4.67914724,4.21279573,6.79280615,6.10375261,6.02019453,6.13239574,6.71213055,4.50908089,4.03038025,6.02951813,5.82834053,6.15195274,3.17172861,5.51577044,3.96025443,5.51473904,6.00128365,6.34118938,4.94945621,3.03591824,5.76955414,6.09682941,5.2738781],
    [4.9579711,4.0002923,3.25455737,3.34851503,6.37354565,6.01042414,4.52113342,5.26745939,6.26920605,6.16422987,3.1255343,6.43074322,3.16986299,4.44038916,4.80228901,3.59629822,4.95137262,6.40821123,4.88515186,4.09138393,5.46744061,5.57049751,4.30876446,6.48064327,4.49197578],
    [5.54204559,3.35072994,5.81306267,5.0076189,6.09887314,4.83277607,3.76282811,5.04093981,6.85498047,6.33627892,4.55758762,6.29734421,3.90469551,6.35388994,4.73173046,5.31449032,3.06976533,3.54300165,3.2381289,3.11252737,3.15451765,6.34363556,6.97934818,6.39008713,6.97971106]
])

onnx_model_0 = onnx.load('./model_0.onnx')
input_dict_0 = {"v5_0": model_input}
shape_dict_0 = {key: val.shape for key, val in input_dict_0.items()}
mod_0, params_0 = relay.frontend.from_onnx(onnx_model_0, shape_dict_0, freeze_params=True)
with tvm.transform.PassContext(opt_level=4):
    executor_0 = relay.build_module.create_executor("graph", mod_0, tvm.cpu(), tvm.target.Target("llvm"), params_0).evaluate()
    output_0 = executor_0(**input_dict_0)

with tvm.transform.PassContext(opt_level=4):
    executor_0 = relay.build_module.create_executor("graph", mod_0, tvm.cpu(), tvm.target.Target("llvm"), params_0).evaluate()
    output_1 = executor_0(**input_dict_0)

print("============")
try:
    testing.assert_allclose(output_0.numpy(), output_1.numpy(), equal_nan=True)
except BaseException as e:
    print("tvm triggers the assertion failure")
    print(e)
print("============")

import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_0 = ort.InferenceSession('./model_0.onnx', providers=['CPUExecutionProvider'], sess_options=sess_options)
res_0 = sess_0.run(["v4_0"], input_dict_0)

sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_1 = ort.InferenceSession('./model_0.onnx', providers=['CPUExecutionProvider'], sess_options=sess_options)
res_1 = sess_1.run(["v4_0"], input_dict_0)

print("============")
testing.assert_allclose(res_0, res_1, equal_nan=True)
print("onnxruntime did not trigger the assertion failure")
print("============")

The model file and the test code are attached in report_bug.zip.

Expected behavior

Obviously, output_0 and output_1 should be identical, because they are the outputs of the same model on the same input.
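
Since the compiled function should be deterministic, an exact element-wise comparison should also hold. A minimal sketch of that stricter check (my addition, reusing output_0, output_1, and the numpy import from the PoC above):

# np.array_equal requires exact equality; a deterministic build should pass this as well.
print("bit-exact:", np.array_equal(output_0.numpy(), output_1.numpy()))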

Actual behavior

output_0 and output_1 are different, and the values are unstable across my trials. In contrast, onnxruntime behaves consistently on every run. The stdout from several of my trials is as follows:

============
tvm triggers the assertion failure

Not equal to tolerance rtol=1e-07, atol=0

Mismatched elements: 21 / 25 (84%)
Max absolute difference: 140577680982095
Max relative difference: 1.40577681e+14
 x: array([[              1,  93883951470360,    137438953472,
                     49,  93883950590848,  93883950359888,
                      0,  93883951470408,     17179869184,...
 y: array([[              1,  93883951603280, 140577771430160,
                      2,  93883947480464,               1,
         93883690123272,  93883950380544,               1,...
============
============
onnxruntime did not trigger the assertion failure
============
============
tvm triggers the assertion failure

Not equal to tolerance rtol=1e-07, atol=0

Mismatched elements: 19 / 25 (76%)
Max absolute difference: 8313387590019572390
Max relative difference: 4.2949673e+09
 x: array([[             1, 93921710608472,   137438953472,          73728,
                    96,             64, 93933299986265, 93921710608520,
           17179869184,          81920, 93921710368576, 93921710761760,...
 y: array([[                  1,      93921710281360,                  32,
                         48,      93933293032941,                  12,
             93922344828927, 8313481511730180910,                  80,...
============
============
onnxruntime did not trigger the assertion failure
============
============
tvm triggers the assertion failure

Not equal to tolerance rtol=1e-07, atol=0

Mismatched elements: 23 / 25 (92%)
Max absolute difference: 15022847525276686
Max relative difference: 4.23258643e+12
 x: array([[              1,  94888918231384,    137438953472,
                    240, 139675352272960,    236223201563,
                     55,  94888918231432,     17179869184,...
 y: array([[                1,    94888917322064,   139675377593892,
           94888918299624,       17179869184,    94888917338808,
           94888712470529,               145,    94888918934400,...
============
============
onnxruntime did not trigger the assertion failure
============

It is worth noting that when opt_level is set to 0, TVM behaves consistently every time, so I assume the inconsistency is introduced by TVM's optimizations.
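
A minimal sketch of that opt_level=0 sanity check (my addition, reusing mod_0, params_0, input_dict_0, and the imports from the PoC above):

def run_once(opt_level):
    # Rebuild the same Relay module at the given optimization level and run it once.
    with tvm.transform.PassContext(opt_level=opt_level):
        executor = relay.build_module.create_executor("graph", mod_0, tvm.cpu(), tvm.target.Target("llvm"), params_0).evaluate()
        return executor(**input_dict_0).numpy()

# Repeated opt_level=0 builds agree with each other; opt_level=4 builds do not.
testing.assert_allclose(run_once(0), run_once(0), equal_nan=True)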

Environment

OS: Ubuntu 22.04 LTS (Linux jin-pc 6.2.0-26-generic #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2 x86_64 x86_64 x86_64 GNU/Linux)
TVM: 0.14.dev226

Steps to reproduce

Unzip the attached zip file and run python3 test.py to execute the PoC.

Triage

ntcmp2u commented 1 year ago

Hi, will anyone take care of this bug?

CookedMelon commented 1 year ago

Hi, how was your model_0.onnx file generated?

ntcmp2u commented 1 year ago

Hi @CookedMelon, I generated the model with a custom fuzzer. Below I provide a code snippet that does not depend on external model files and illustrates what the model looks like. I think it will help developers locate which operator is being optimized incorrectly. Are you a TVM developer? Will this bug be fixed?

import onnxruntime as ort
import onnx
import numpy as np
import pickle
from numpy import testing
import tvm
from tvm import relay
import torch

class Model0(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, *args):
        _args = args
        getitem = _args[0]
        tril = getitem.tril(0)
        div = torch.div(tril, tril)
        to = div.to(dtype = torch.int64)
        getitem_1 = to[(slice(-13, -12, 1), slice(None, None, None))]
        expand = getitem_1.expand(1, 25)
        return (expand,)

model_0 = Model0()
output_names_0 = ['v4_0']
input_dict_0 = pickle.load(open('./0.pickle', 'rb'))
inputs_0 = tuple(torch.from_numpy(v).to('cpu') for _, v in input_dict_0.items())
torch.onnx.export(model_0, inputs_0, '0.onnx', verbose=False, input_names=['v5_0'], output_names=output_names_0, opset_version=14, do_constant_folding=False)

onnx_model_0 = onnx.load('0.onnx')
onnx_model_outputs_0 = [node.name for node in onnx_model_0.graph.output]
shape_dict_0 = {key: val.shape for key, val in input_dict_0.items()}
mod_0, params_0 = relay.frontend.from_onnx(onnx_model_0, shape_dict_0, freeze_params=True)

def func():
    with tvm.transform.PassContext(opt_level=4):
        executor_0 = relay.build_module.create_executor("graph", mod_0, tvm.cpu(), tvm.target.Target("llvm"), params_0).evaluate()
        executor_res_0 = [executor_0(**input_dict_0).numpy()]
        output_0 = dict(zip(onnx_model_outputs_0, executor_res_0))
        return output_0

output_0 = func()
output_1 = func()

print('=========================')
try:
    for tensor_name in output_names_0:
        testing.assert_allclose(output_0[tensor_name], output_1[tensor_name])
    print("no problem")
except AssertionError as e:
    print("assertion failure for inconsistency")
    print(e)
print('=========================')
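
For reference, here is a rough NumPy equivalent of Model0.forward (my own sketch for illustration, not the exported ONNX graph; the helper name model0_numpy is made up, and it reuses input_dict_0 and the numpy import from the snippet above):

def model0_numpy(x):
    tril = np.tril(x)                     # zero out the strict upper triangle
    with np.errstate(invalid='ignore'):
        div = tril / tril                 # 0/0 above the diagonal yields NaN
    to = div.astype(np.int64)             # casting NaN to int64 is not numerically well defined
    row = to[-13:-12, :]                  # select the first row of the 13x25 matrix
    return np.broadcast_to(row, (1, 25))  # mirror expand(1, 25)

print(model0_numpy(next(iter(input_dict_0.values()))))

Only the diagonal element of the selected row comes from 1.0/1.0; the other 24 entries come from the NaN-to-int64 cast, which might be where the inconsistency shows up.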

CookedMelon commented 1 year ago

I'm not a TVM developer, but I have also found that some deep learning framework APIs do not work correctly in TVM. I've posted an issue about this phenomenon before: issue 16016, "compile tensorflow with tvm but get unexpected return value". In fact, I've found that there are a lot of APIs that behave like this.

ntcmp2u commented 1 year ago

Yep. TVM still needs more testing, since it has some unstable (or incorrect) optimizations/transformations for user-provided models.

BTW, I raised this issue almost a month ago; it seems the TVM team is understaffed and doesn't have time to handle it.