microsoft / onnxruntime

ONNX Runtime: cross-platform, high performance ML inferencing and training accelerator
https://onnxruntime.ai

[Bug] Coqui VITS ONNX model can't be statically quantized. #16738

Open mllopartbsc opened 1 year ago

mllopartbsc commented 1 year ago

Describe the issue

Hi, I was trying to statically quantize this Coqui VITS model that I had previously exported to ONNX. When doing so, I encountered this error:

onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Reshape node. Name:'/duration_predictor/flows.4/Reshape_27_output_0_ReduceMax_Reshape' Status Message: /onnxruntime_src/onnxruntime/core/providers/cpu/tensor/reshape_helper.h:40 onnxruntime::ReshapeHelper::ReshapeHelper(const onnxruntime::TensorShape&, onnxruntime::TensorShapeVector&, bool) gsl::narrow_cast<int64_t>(input_shape.Size()) == size was false. The input tensor cannot be reshaped to the requested shape. Input shape:{0}, requested shape:{1}

Below is the script that I'm using so that you can reproduce the bug. To reproduce it, you only need to change the paths used to load the Coqui VITS model, export the ONNX version, and then load it back. You can download the mentioned model and its configuration from this link.

# Import necessary libraries
import os
import urllib.request
import time
import tqdm
import numpy
import onnx
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
import onnxruntime as ort
import torch
import random

# TTS imports
from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig
from TTS.utils.audio.numpy_transforms import save_wav

# ONNX Runtime quantization imports
from onnxruntime.quantization.quantize import quantize_dynamic, quantize_static
from onnxruntime.quantization.calibrate import CalibrationDataReader
from onnxruntime.quantization.quant_utils import QuantFormat, QuantType
from onnxruntime.quantization.shape_inference import quant_pre_process

# Load VITS model configuration from JSON file
config = VitsConfig()
config.load_json("/home/mllopart/PycharmProjects/ONNX/models/vits_ca/config.json")

# Initialize VITS model and load its checkpoint
vits = Vits.init_from_config(config)
vits.load_checkpoint(config, "/home/mllopart/PycharmProjects/ONNX/models/vits_ca/model_file.pth")

# Export the VITS model to ONNX format
vits.export_onnx()
vits.load_onnx("coqui_vits.onnx")

# Define the model path
model_name = "/home/mllopart/PycharmProjects/ttsAPI/tts-api/server/coqui_vits.onnx"

# Set execution providers depending on CUDA availability
cuda = False  # Set to True if CUDA is available and you want to use GPU
providers = [
    "CPUExecutionProvider" if cuda is False else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
]

# Create an ONNX Runtime session with the defined options and providers
sess_options = ort.SessionOptions()
onnx_sess = ort.InferenceSession(model_name, sess_options=sess_options, providers=providers)

# Define a text to convert to speech
text = "From the beginning of time, human beings have been driven by an insatiable curiosity to explore the unknown."

# Convert text to input tensor for the model
x = numpy.asarray(
    vits.tokenizer.text_to_ids(text, language="en"),
    dtype=numpy.int64,
)[None, :]

# Variables related to the model's behavior
x_lengths = None
num_chars: int = 100
inference_noise_scale: float = 1.0
length_scale: float = 1
inference_noise_scale_dp: float = 1.0
num_speakers: int = 0

# If x and x_lengths are torch tensors, convert them to numpy
if isinstance(x, torch.Tensor):
    x = x.cpu().numpy()
if x_lengths is None:
    x_lengths = numpy.array([x.shape[1]], dtype=numpy.int64)
if isinstance(x_lengths, torch.Tensor):
    x_lengths = x_lengths.cpu().numpy()

# Prepare scales for inference
scales = numpy.array(
    [inference_noise_scale, length_scale, inference_noise_scale_dp],
    dtype=numpy.float32,
)

# Run inference to generate audio as a test that inputs are correctly set.
audio = onnx_sess.run(
    ["output"],
    {
        "input": x,
        "input_lengths": x_lengths,
        "scales": scales,
    },
)

# Generate 50 random sentences
subjects = ["I", "You", "Bob", "Alice", "The cat", "The robot"]
verbs = ["like", "hate", "see", "touch", "admire", "love"]
objects = ["apples", "the moon", "the rain", "a beautiful painting", "the idea of existence", "the sound of the ocean"]

sentences = []
for i in range(50):
    subject = random.choice(subjects)
    verb = random.choice(verbs)
    object = random.choice(objects)
    sentence = f"{subject} {verb} {object}."
    sentences.append(sentence)

# Prepare inputs for the sentences
x = [numpy.asarray(vits.tokenizer.text_to_ids(sentence, language="en"), dtype=numpy.int64)[None, :] for sentence in sentences]
x_lengths = [numpy.array([len(x_i[0])], dtype=numpy.int64) for x_i in x]
scales = [numpy.array([inference_noise_scale, length_scale, inference_noise_scale_dp], dtype=numpy.float32) for _ in range(50)]

# Run inference for one sentence as another test to see if the inputs are correctly set.

audio = onnx_sess.run(
    ["output"],
    {
        "input": x[1],
        "input_lengths": x_lengths[1],
        "scales": scales[1],
    },
)

# Define a custom data reader class for model quantization
class DataReader(CalibrationDataReader):
    def __init__(self, x, x_lengths, scales):
        self.data1 = x
        self.data2 = x_lengths
        self.data3 = scales
        self.pos = -1

    def get_next(self):
        if self.pos >= len(self.data1) - 1:
            return None
        self.pos += 1
        return {'input': self.data1[self.pos], 'input_lengths': self.data2[self.pos], 'scales': self.data3[self.pos]}

    def rewind(self):
        self.pos = -1

# Define the quantized model name
quantize_name = model_name + ".qdq.onnx"

# Quantize the model to optimize it
quantize_static(model_name, quantize_name, calibration_data_reader=DataReader(x, x_lengths, scales), quant_format=QuantFormat.QDQ)
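A side note, not part of the original script: quant_pre_process is imported above but never called. Below is a minimal sketch of what running that pre-processing step before static quantization could look like; the preprocessed_name path is a hypothetical choice and the default pre-processing options are assumed.

# Hedged sketch: run shape-inference/optimization pre-processing first,
# then quantize the pre-processed model instead of the raw export.
preprocessed_name = model_name + ".preprocessed.onnx"
quant_pre_process(model_name, preprocessed_name)
quantize_static(
    preprocessed_name,
    quantize_name,
    calibration_data_reader=DataReader(x, x_lengths, scales),
    quant_format=QuantFormat.QDQ,
)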

It seems the reason behind the error is a line in the calibrate.py script. This is the function containing that line:

    def collect_data(self, data_reader: CalibrationDataReader):
        while True:
            inputs = data_reader.get_next()
            if not inputs:
                break
            self.intermediate_outputs.append(self.infer_session.run(None, inputs))

        if len(self.intermediate_outputs) == 0:
            raise ValueError("No data is collected.")

        self.compute_range()
        self.clear_collected_data()

The error emerges from the self.infer_session.run(None, inputs) call. Here's the definition of the function that creates the session:

    def create_inference_session(self):
        """
        create an OnnxRuntime InferenceSession.
        """
        sess_options = onnxruntime.SessionOptions()
        sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
        self.infer_session = onnxruntime.InferenceSession(
            self.augmented_model_path,
            sess_options=sess_options,
            providers=self.execution_providers,
        )

As you can see, the onnxruntime.InferenceSession is created using augmented_model_path, which is defined on line 54 of the calibrate.py script. Since this model isn't the same as the ONNX model used for inference, the error appears because the augmented model requires different inputs. However, in the script above I've checked twice that the input data is correct, and it is. And, as a later test will show, the error disappears when the inference session uses the VITS ONNX model instead of the augmented ONNX model. Here, I have three questions:

1- Where does the augmented model come from? Is it created as a blank .onnx file that is then filled in from the original ONNX model?

2- Why does it require different inputs than the original ONNX model?

3- How could this input problem be fixed?
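As an aside (not part of the original report), here is a small sketch that may help investigate questions 1 and 2: assuming the calibrator has left an augmented_model.onnx on disk (that path is an assumption), the graph inputs of the augmented and original models can be compared directly with the onnx library.

import onnx

original = onnx.load("coqui_vits.onnx")
augmented = onnx.load("augmented_model.onnx")  # assumed filename/location

def graph_input_names(model):
    # Initializers may also be listed under graph.input, so filter them out
    # to keep only the feeds a session actually expects.
    initializers = {init.name for init in model.graph.initializer}
    return [i.name for i in model.graph.input if i.name not in initializers]

print("original inputs: ", graph_input_names(original))
print("augmented inputs:", graph_input_names(augmented))

If the two lists match, the failure is more likely caused by an intermediate tensor becoming empty during calibration than by a mismatch in the required feeds.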

However, the problems don't end here. To test whether the problem was only with the inference session of the augmented model, I changed the inference session to use the Coqui VITS ONNX model instead. Here's the modification I applied:

import os
import urllib.request
import time
import tqdm
import numpy
import onnx
import pandas
import matplotlib.pyplot as plt
from onnxruntime import InferenceSession
import onnxruntime as ort
import torch
import random

from TTS.tts.models.vits import Vits
from TTS.tts.configs.vits_config import VitsConfig
from TTS.utils.audio.numpy_transforms import save_wav

config = VitsConfig()
config.load_json("/home/mllopart/PycharmProjects/ONNX/models/vits_ca/config.json")
vits = Vits.init_from_config(config)
vits.load_onnx("coqui_vits.onnx")

model_name = "/home/mllopart/PycharmProjects/ttsAPI/tts-api/server/coqui_vits.onnx"

cuda = False
providers = [
    "CPUExecutionProvider"
    if cuda is False
    else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
]
sess_options = ort.SessionOptions()
onnx_sess = ort.InferenceSession(model_name, sess_options=sess_options, providers=providers)

I added this between lines 20 and 23 of the calibrate.py script in the onnxruntime library. Then I modified line 243 from this:

self.intermediate_outputs.append(self.infer_session.run(None, inputs))

To this:

self.intermediate_outputs.append(onnx_sess.run(None, inputs))

So that the VITS model ran the inference instead. With that change, calibrate.py worked fine, but a problem arose in the quantize.py script, at line 406. This is the error:


WARNING:root:failed to infer the type of tensor: . Skip to quantize it. Please check if it is expected.
WARNING:root:failed to infer the type of tensor: . Skip to quantize it. Please check if it is expected.
.
.
.
WARNING:root:failed to infer the type of tensor: . Skip to quantize it. Please check if it is expected.
Traceback (most recent call last):
  File "/home/mllopart/PycharmProjects/ttsAPI/tts-api/server/static_quantization.py", line 142, in <module>
    quantize_static(model_name, quantize_name, calibration_data_reader=DataReader(x, x_lengths, scales), quant_format=QuantFormat.QDQ)
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/quantize.py", line 406, in quantize_static
    quantizer.quantize_model()
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/qdq_quantizer.py", line 217, in quantize_model
    self._quantize_normal_tensors()
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/qdq_quantizer.py", line 385, in _quantize_normal_tensors
    raise ValueError(
ValueError: Quantization parameters are not specified for param /text_encoder/Constant_output_0. In static mode quantization params for inputs and outputs of nodes to be quantized are required.

This seems to be an error about missing quantization parameters. I am assuming that these parameters are not present after converting the VITS model to ONNX, but I am not too sure about the cause of this error.
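For what it's worth, here is a short sketch (not from the original report) that walks the graph to find which nodes produce or consume the tensor named in the ValueError; seeing the surrounding ops can help judge why no quantization parameters were computed for it.

import onnx

model = onnx.load("coqui_vits.onnx")
target = "/text_encoder/Constant_output_0"

for node in model.graph.node:
    if target in node.output:
        print("produced by:", node.op_type, node.name)
    if target in node.input:
        print("consumed by:", node.op_type, node.name)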

I would appreciate it if someone could provide their insights on these errors.

Kind Regards.

To reproduce

1- Install the required libraries (they all appear as imports in the scripts).

2- Download the VITS model and its configuration with the provided link.

3- Change the import and export paths in the script.

4- Run the script.

5- Modify the calibrate.py script from the onnxruntime library to change the inference session to the VITS ONNX model.

6- Run the script again.

Urgency

This problem is very urgent, as it is part of a project where inference times are crucial. The sooner it can be fixed, the better.

Platform

Linux

OS Version

Ubuntu 22.04.2 LTS

ONNX Runtime Installation

Released Package

ONNX Runtime Version or Commit ID

ONNX Runtime v1.15.1

ONNX Runtime API

Python

Architecture

X64

Execution Provider

Default CPU

Execution Provider Library Version

No response

yufenglee commented 1 year ago

Looks like the tensor in the model with the name "/duration_predictor/flows.4/Reshape_27_output_0" has size 0. I think it may be because you are using randomly generated calibration data. For quantization, you need to use a carefully selected, real data set to get good accuracy.

And could you please share the augmented model?

mllopartbsc commented 1 year ago

Hi @yufenglee,

Thank you for the prompt response. The Coqui VITS ONNX model is a text-to-speech model. Therefore, since the required input data are English sentences, that is precisely what I'm generating in the sentences list. I don't see any problem with that. What do you mean by a carefully selected real data set? And how would it differ from what I'm inputting?

On the other hand, when I run my script, no "augmented_model.onnx" is created, so I can't share it with you. However, at some point an augmented_model.onnx was created, but I deleted it because I believe it was from the BERT static quantization example. Isn't it supposed to be created on every run?
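One possible way to obtain a persistent copy (a sketch under the assumption that create_calibrator in onnxruntime 1.15 accepts an explicit augmented_model_path and writes the augmented model there) is to build the calibrator directly, so the file lands at a known location that can then be shared:

from onnxruntime.quantization.calibrate import create_calibrator

# Hedged sketch: create the calibrator by hand so augmented_model.onnx is
# written to the current directory instead of an internal temporary path.
calibrator = create_calibrator(model_name, augmented_model_path="augmented_model.onnx")
calibrator.collect_data(DataReader(x, x_lengths, scales))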

Here's the script used to convert the VITS model to ONNX. It may give you some insight into how the dummy inputs and the expected inputs work.

Kind Regards

mllopartbsc commented 1 year ago

Following up on the previous comment, I commented out the following section of my script:

# Generate 50 random sentences
subjects = ["I", "You", "Bob", "Alice", "The cat", "The robot"]
verbs = ["like", "hate", "see", "touch", "admire", "love"]
objects = ["apples", "the moon", "the rain", "a beautiful painting", "the idea of existence", "the sound of the ocean"]

sentences = []
for i in range(50):
    subject = random.choice(subjects)
    verb = random.choice(verbs)
    object = random.choice(objects)
    sentence = f"{subject} {verb} {object}."
    sentences.append(sentence)

And replaced it with this hand-made list of 50 English sentences:

sentences = [
    "The sky is blue.",
    "I love apples.",
    "She is reading a book.",
    "I have a pet dog.",
    "I like to play soccer.",
    "Python is a powerful language.",
    "The sun sets in the west.",
    "The food here is delicious.",
    "I am going to the park.",
    "The cake is in the oven.",
    "He is playing the guitar.",
    "It is raining outside.",
    "I am baking cookies.",
    "I enjoy reading novels.",
    "She likes to play tennis.",
    "We are going on a trip.",
    "I am learning Spanish.",
    "He is practicing the piano.",
    "She loves to dance.",
    "The dog is sleeping.",
    "I am eating a sandwich.",
    "They are watching a movie.",
    "She has a red bicycle.",
    "I am visiting my grandparents.",
    "I lost my keys.",
    "The birds are singing.",
    "I am drinking coffee.",
    "He is studying for the exam.",
    "I went to the beach.",
    "I am learning to cook.",
    "The cat is playing with a ball.",
    "She is brushing her hair.",
    "He has a blue car.",
    "I am painting a picture.",
    "She is feeding the birds.",
    "The pizza is delicious.",
    "I saw a beautiful sunset.",
    "I am playing video games.",
    "She is knitting a scarf.",
    "They are planting flowers.",
    "The moon is full tonight.",
    "I am writing a letter.",
    "The ice cream is melting.",
    "She is washing the dishes.",
    "I am going for a run.",
    "He is fixing the computer.",
    "I am listening to music.",
    "The coffee is hot.",
    "I am cleaning the house.",
    "He is driving a truck."
]

So that they're not randomly generated.

Without modifying the path of the augmented model in the calibrate.py script, this is the error that emerges:

WARNING:root:Please consider to run pre-processing before quantization. Refer to example: https://github.com/microsoft/onnxruntime-inference-examples/blob/main/quantization/image_classification/cpu/ReadMe.md 
2023-07-17 16:41:31.575414164 [W:onnxruntime:, execution_frame.cc:651 AllocateMLValueTensorPreAllocateBuffer] Shape mismatch attempting to re-use buffer. {1} != {0}. Validate usage of dim_value (values should be > 0) and dim_param (all values with the same string should equate to the same size) in shapes in the model.
2023-07-17 16:41:31.575516224 [E:onnxruntime:, sequential_executor.cc:514 ExecuteKernel] Non-zero status code returned while running Reshape node. Name:'/duration_predictor/flows.2/Reshape_27_output_0_ReduceMax_Reshape' Status Message: /onnxruntime_src/onnxruntime/core/providers/cpu/tensor/reshape_helper.h:40 onnxruntime::ReshapeHelper::ReshapeHelper(const onnxruntime::TensorShape&, onnxruntime::TensorShapeVector&, bool) gsl::narrow_cast<int64_t>(input_shape.Size()) == size was false. The input tensor cannot be reshaped to the requested shape. Input shape:{0}, requested shape:{1}

Traceback (most recent call last):
  File "/home/mllopart/PycharmProjects/ttsAPI/tts-api/server/static_quantization.py", line 196, in <module>
    quantize_static(model_name, quantize_name, calibration_data_reader=DataReader(x, x_lengths, scales), quant_format=QuantFormat.QDQ)
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/quantize.py", line 369, in quantize_static
    calibrator.collect_data(calibration_data_reader)
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/calibrate.py", line 243, in collect_data
    self.intermediate_outputs.append(self.infer_session.run(None, inputs))
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/capi/onnxruntime_inference_collection.py", line 217, in run
    return self._sess.run(output_names, input_feed, run_options)
onnxruntime.capi.onnxruntime_pybind11_state.RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running Reshape node. Name:'/duration_predictor/flows.2/Reshape_27_output_0_ReduceMax_Reshape' Status Message: /onnxruntime_src/onnxruntime/core/providers/cpu/tensor/reshape_helper.h:40 onnxruntime::ReshapeHelper::ReshapeHelper(const onnxruntime::TensorShape&, onnxruntime::TensorShapeVector&, bool) gsl::narrow_cast<int64_t>(input_shape.Size()) == size was false. The input tensor cannot be reshaped to the requested shape. Input shape:{0}, requested shape:{1}

If I modify the path of the session to the VITS model, then this error emerges from the quantize.py script:

WARNING:root:failed to infer the type of tensor: . Skip to quantize it. Please check if it is expected.
WARNING:root:failed to infer the type of tensor: . Skip to quantize it. Please check if it is expected.
.
.
.
WARNING:root:failed to infer the type of tensor: . Skip to quantize it. Please check if it is expected.
Traceback (most recent call last):
  File "/home/mllopart/PycharmProjects/ttsAPI/tts-api/server/static_quantization.py", line 201, in <module>
    quantize_static(model_name, quantize_name, calibration_data_reader=DataReader(x, x_lengths, scales), quant_format=QuantFormat.QDQ)
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/quantize.py", line 406, in quantize_static
    quantizer.quantize_model()
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/qdq_quantizer.py", line 217, in quantize_model
    self._quantize_normal_tensors()
  File "/home/mllopart/PycharmProjects/ttsAPI/venv/lib/python3.10/site-packages/onnxruntime/quantization/qdq_quantizer.py", line 385, in _quantize_normal_tensors
    raise ValueError(
ValueError: Quantization parameters are not specified for param /text_encoder/Constant_output_0. In static mode quantization params for inputs and outputs of nodes to be quantized are required.

Therefore, I get the same errors even when the input data isn't randomly generated.

Again, thank you for the quick answer.

smallbraingames commented 2 months ago

I am running into this as well. Has it been resolved?

mllopartbsc commented 2 months ago

I am running into this as well. Has it been resolved?

No, it hasn't.