microsoft / onnxruntime

ONNX Runtime: cross-platform, high performance ML inferencing and training accelerator
https://onnxruntime.ai
MIT License

Quantized model does not support dynamic axes #8169

Open rohanshingade opened 3 years ago

rohanshingade commented 3 years ago

Describe the bug
The ONNX model exported from PyTorch supports dynamic axes, whereas the quantized ONNX model does not.
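
A quick way to observe the difference is to compare the graph inputs of the two models. A minimal sketch (file names match the repro below; dynamic dims appear as symbolic names via dim_param, fixed dims as integers via dim_value):

import onnx

def print_input_dims(path):
    # print each graph input with its per-dimension symbolic name or fixed size
    model = onnx.load(path)
    for inp in model.graph.input:
        dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
        print(path, inp.name, dims)

print_input_dims('xlmronnx_model.onnx')            # expected: ['batch_size', 'max_seq_len']
print_input_dims('xlmronnx_model_quantized.onnx')  # if the bug reproduces, dims may come out fixed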

System information

To Reproduce

import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
os.environ["OMP_NUM_THREADS"]="5"

import numpy as np
import onnxruntime
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification
)

from onnxruntime.quantization import quantize_qat, QuantType

# load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base')
device = torch.device("cpu")
model.eval()
model.to(device)

# six sample texts; tokenizing with max_length = 35 below
# produces a 6x35 input (input_ids and attention_mask)
max_seq_length = 35
sample_texts = ['the impact of the Delta plus strain will become clear only as more cases are identified',
                'the impact of the Delta plus strain will become clear only as more cases are identified',
                'The BBC said the documents suggested the ship’s mission was conducted in the expectation that Russia might respond aggressively.',
                'You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.',
                'Some of these countries are slated to open their borders in July, and in some direct flights are not yet operational.',
                'So do check the dates and details before firming up any travel plans.']
tokenized_outputs = tokenizer.batch_encode_plus(
            sample_texts,
            max_length=max_seq_length,
            padding="max_length",
            return_tensors='pt',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
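
# sanity check (optional): six texts padded to max_length should give 6x35 tensors
print(tokenized_outputs['input_ids'].shape)  # expected: torch.Size([6, 35])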

# convert the model to ONNX using the tokenized sample texts as input
with torch.no_grad():
    symbolic_names = {0: 'batch_size', 1:'max_seq_len'}
    torch.onnx.export(model,
                      args=tuple(tokenized_outputs.values()),
                      f='xlmronnx_model.onnx',
                      opset_version=11,
                      do_constant_folding=False,
                      input_names=['input_ids', 'attention_mask'],
                      output_names=['logits'],
                      dynamic_axes={'input_ids': symbolic_names, 'attention_mask': symbolic_names})

# create a session for the ONNX model to test
session = onnxruntime.InferenceSession('xlmronnx_model.onnx', providers=['CPUExecutionProvider'])
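
# sanity check (optional): the exported graph should report the symbolic dims
# declared in dynamic_axes above, e.g. input_ids -> ['batch_size', 'max_seq_len']
for inp in session.get_inputs():
    print(inp.name, inp.shape)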

# test 1: single text, padded to length 35 (the export-time sequence length)
test1 = tokenizer.batch_encode_plus(
            ["A text can be any example of written or spoken language"],
            max_length=35,
            padding='max_length',
            return_tensors=None,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
input_dict_np1 = {'input_ids':np.array(test1['input_ids']), 'attention_mask':np.array(test1['attention_mask'])}

# test 2: batch of two texts, padded to length 45 (different batch size and sequence length)
test2 = tokenizer.batch_encode_plus(
            ["A text can be any example of written or spoken language", "Five dead in mexico in hot air ballon crash"],
            max_length=45,
            padding='max_length',
            return_tensors=None,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
input_dict_np2 = {'input_ids':np.array(test2['input_ids']), 'attention_mask':np.array(test2['attention_mask'])}

# test 3: same text as test 1, max_length 45 but no padding (natural sequence length)
test3 = tokenizer.batch_encode_plus(
            ["A text can be any example of written or spoken language"],
            max_length=45,
            padding=False,
            return_tensors=None,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
input_dict_np3 = {'input_ids':np.array(test3['input_ids']), 'attention_mask':np.array(test3['attention_mask'])}

# print output of test1
print(session.run(None, input_dict_np1)[0])

# print output of test2
print(session.run(None, input_dict_np2)[0])

# print output of test3
print(session.run(None, input_dict_np3)[0])

# quantize the exported model; quantize_qat writes the quantized model to the output path
quantize_qat('xlmronnx_model.onnx', 'xlmronnx_model_quantized.onnx')
quantized_session = onnxruntime.InferenceSession('xlmronnx_model_quantized.onnx', providers=['CPUExecutionProvider'])
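
# sanity check (optional): if the bug reproduces, the quantized graph may report
# fixed dims (e.g. the export-time 6x35 shape) instead of the symbolic names above
for inp in quantized_session.get_inputs():
    print(inp.name, inp.shape)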

# print output of test1 for quantized model
print(quantized_session.run(None, input_dict_np1)[0])

# print output of test2 for quantized model
print(quantized_session.run(None, input_dict_np2)[0])

# print output of test3 for quantized model
print(quantized_session.run(None, input_dict_np3)[0])
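
If dynamic quantization is acceptable for this use case, a possible workaround is quantize_dynamic instead of quantize_qat: it mainly rewrites the weight initializers, so the symbolic input dims are typically left in place. A sketch (output file name is made up; not verified on this exact model):

from onnxruntime.quantization import quantize_dynamic, QuantType

# dynamic (weight-only) quantization of the same exported model
quantize_dynamic('xlmronnx_model.onnx',
                 'xlmronnx_model_quantized_dyn.onnx',
                 weight_type=QuantType.QInt8)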
stale[bot] commented 2 years ago

This issue has been automatically marked as stale due to inactivity and will be closed in 7 days if no further activity occurs. If further support is needed, please provide an update and/or more details.