microsoft / onnxconverter-common

Common utilities for ONNX converters
MIT License

onnxconverter_common.auto_mixed_precision.auto_convert_mixed_precision never ends #251

Open FrancescoSaverioZuppichini opened 1 year ago

FrancescoSaverioZuppichini commented 1 year ago

Hey guys,

I hope you are doing great. onnxconverter_common.auto_mixed_precision.auto_convert_mixed_precision takes forever; I let it run for 15 minutes and it was still only about halfway done. Any idea why? My code:

import onnxruntime as ort
import torch
from torchvision.models import ConvNeXt_Small_Weights, convnext_small
from onnxconverter_common.auto_mixed_precision import auto_convert_mixed_precision
import onnx

model_name = "model.onnx"
model = convnext_small(weights=ConvNeXt_Small_Weights.IMAGENET1K_V1).eval().cuda()

x = torch.randn(1, 3, 224, 224, device="cuda")
# Export the model
torch.onnx.export(
    model,  # model being run
    x,  # model input (or a tuple for multiple inputs)
    model_name,  # where to save the model (can be a file or file-like object)
    opset_version=16,
    export_params=True,  # store the trained parameter weights inside the model file
    do_constant_folding=True,  # whether to execute constant folding for optimization
    input_names=["image"],  # the model's input names
    output_names=["output"],  # the model's output names
    dynamic_axes={
        "image": {0: "batch_size"},  # variable length axes
        "output": {0: "batch_size"},
    },
)

model = onnx.load(model_name)
model_fp16 = auto_convert_mixed_precision(
    model,
    {"image": x.cpu().numpy()},
    None,  # validate_fn; falls back to the rtol/atol check below
    rtol=0.01,
    atol=0.001,
    keep_io_types=True,
)
onnx.save(model_fp16, "model_fp16.onnx")
# let's check: keep_io_types=True keeps the graph inputs and outputs in
# float32, so the converted model must still be fed float32 data
print("Checking")
x = torch.randn(1, 3, 224, 224, device="cuda")
ort_session = ort.InferenceSession("model_fp16.onnx", providers=["CUDAExecutionProvider"])
outputs = ort_session.run(None, {"image": x.cpu().numpy()})
print(outputs[0].shape, outputs[0].dtype)

Thanks a lot :)
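
For what it's worth, auto_convert_mixed_precision re-runs the model after each candidate conversion while it searches for the set of nodes that can safely stay in float16, so conversion time grows with model size and the number of validation runs. Below is a rough sketch of passing a custom validate_fn instead of rtol/atol, assuming (as the positional None above suggests) that the third parameter is a callable receiving the float32 and mixed-precision output lists and returning True when they are close enough:

import numpy as np
import onnx
from onnxconverter_common.auto_mixed_precision import auto_convert_mixed_precision

def validate(res_fp32, res_mixed):
    # Both arguments are lists of numpy arrays, one entry per model output.
    return all(
        np.allclose(a, b, rtol=0.05, atol=0.01)
        for a, b in zip(res_fp32, res_mixed)
    )

model = onnx.load("model.onnx")
model_fp16 = auto_convert_mixed_precision(
    model,
    {"image": np.random.randn(1, 3, 224, 224).astype(np.float32)},
    validate_fn=validate,  # used in place of the built-in rtol/atol check
    keep_io_types=True,
)
onnx.save(model_fp16, "model_fp16.onnx")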

xiaowuhu commented 1 year ago

@FrancescoSaverioZuppichini

Can you please try the API call below:

def convert_float32_to_float16(fp32_model_path, fp16_model_path):
    import onnx
    from onnxmltools.utils.float16_converter import convert_float_to_float16
    from onnxmltools.utils import save_model

    model = onnx.load(fp32_model_path)
    new_onnx_model = convert_float_to_float16(model, keep_io_types=True)
    save_model(new_onnx_model, fp16_model_path)

convert_float32_to_float16("fp32.onnx", "fp16_1.onnx")

If it still doesn't work, you could share your ONNX model with me via Google Drive (if the model is very large), or just email it to me (xiaowuhu@microsoft.com) as an attachment (if it is smaller than 200 MB).
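
If the straight float16 conversion loses too much accuracy, convert_float_to_float16 also accepts an op_block_list to keep selected op types in float32. A sketch using the onnxconverter_common module directly (the op names here are illustrative examples, not a recommendation for this model):

import onnx
from onnxconverter_common import float16

model = onnx.load("fp32.onnx")
# Keep numerically sensitive ops in float32; everything else goes to float16.
model_fp16 = float16.convert_float_to_float16(
    model,
    keep_io_types=True,
    op_block_list=["Softmax", "ReduceMean"],  # adjust per model
)
onnx.save(model_fp16, "fp16_blocked.onnx")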