microsoft / onnxruntime

ONNX Runtime: cross-platform, high performance ML inferencing and training accelerator
https://onnxruntime.ai
MIT License

Quantized model does not support dynamic axes #8169

Open rohanshingade opened 3 years ago

rohanshingade commented 3 years ago

Describe the bug
The ONNX model exported from PyTorch supports dynamic axes, whereas the quantized ONNX model does not.
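
A quick way to observe the difference is to compare the graph inputs of the two models. A minimal sketch (file names match the repro below; dynamic dims appear as symbolic names via dim_param, fixed dims as integers via dim_value):

import onnx

def print_input_dims(path):
    # print each graph input with its per-dimension symbolic name or fixed size
    model = onnx.load(path)
    for inp in model.graph.input:
        dims = [d.dim_param or d.dim_value for d in inp.type.tensor_type.shape.dim]
        print(path, inp.name, dims)

print_input_dims('xlmronnx_model.onnx')            # expected: ['batch_size', 'max_seq_len']
print_input_dims('xlmronnx_model_quantized.onnx')  # if the bug reproduces, dims may come out fixed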

System information

To Reproduce

import os
os.environ["CUDA_VISIBLE_DEVICES"]="-1"
os.environ["OMP_NUM_THREADS"]="5"

import numpy as np
import onnxruntime
import torch
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification
)

from onnxruntime.quantization import quantize_qat, QuantType

# load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base')
device = torch.device("cpu")
model.eval()
model.to(device)

# six sample texts; tokenizing with max_length = 35 below
# produces a 6x35 input (input_ids and attention_mask)
max_seq_length = 35
sample_texts = ['the impact of the Delta plus strain will become clear only as more cases are identified',
                'the impact of the Delta plus strain will become clear only as more cases are identified',
                'The BBC said the documents suggested the ship’s mission was conducted in the expectation that Russia might respond aggressively.',
                'You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.',
                'Some of these countries are slated to open their borders in July, and in some direct flights are not yet operational.',
                'So do check the dates and details before firming up any travel plans.']
tokenized_outputs = tokenizer.batch_encode_plus(
            sample_texts,
            max_length=max_seq_length,
            padding="max_length",
            return_tensors='pt',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
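
# sanity check (optional): six texts padded to max_length should give 6x35 tensors
print(tokenized_outputs['input_ids'].shape)  # expected: torch.Size([6, 35])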

# convert the model to ONNX using the tokenized sample texts as input
with torch.no_grad():
    symbolic_names = {0: 'batch_size', 1:'max_seq_len'}
    torch.onnx.export(model,
                      args=tuple(tokenized_outputs.values()),
                      f='xlmronnx_model.onnx',
                      opset_version=11,
                      do_constant_folding=False,
                      input_names=['input_ids', 'attention_mask'],
                      output_names=['logits'],
                      dynamic_axes={'input_ids': symbolic_names, 'attention_mask': symbolic_names})

# create a session for the ONNX model to test
session = onnxruntime.InferenceSession('xlmronnx_model.onnx', providers=['CPUExecutionProvider'])
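
# sanity check (optional): the exported graph should report the symbolic dims
# declared in dynamic_axes above, e.g. input_ids -> ['batch_size', 'max_seq_len']
for inp in session.get_inputs():
    print(inp.name, inp.shape)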

# test 1: single text, padded to length 35 (the export-time sequence length)
test1 = tokenizer.batch_encode_plus(
            ["A text can be any example of written or spoken language"],
            max_length=35,
            padding='max_length',
            return_tensors=None,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
input_dict_np1 = {'input_ids':np.array(test1['input_ids']), 'attention_mask':np.array(test1['attention_mask'])}

# test 2: batch of two texts, padded to length 45 (different batch size and sequence length)
test2 = tokenizer.batch_encode_plus(
            ["A text can be any example of written or spoken language", "Five dead in mexico in hot air ballon crash"],
            max_length=45,
            padding='max_length',
            return_tensors=None,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
input_dict_np2 = {'input_ids':np.array(test2['input_ids']), 'attention_mask':np.array(test2['attention_mask'])}

# test 3: same text as test 1, max_length 45 but no padding (natural sequence length)
test3 = tokenizer.batch_encode_plus(
            ["A text can be any example of written or spoken language"],
            max_length=45,
            padding=False,
            return_tensors=None,
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True
        )
input_dict_np3 = {'input_ids':np.array(test3['input_ids']), 'attention_mask':np.array(test3['attention_mask'])}

# print output of test1
print(session.run(None, input_dict_np1)[0])

# print output of test2
print(session.run(None, input_dict_np2)[0])

# print output of test3
print(session.run(None, input_dict_np3)[0])

# quantize the exported model; quantize_qat writes the quantized model to the output path
quantize_qat('xlmronnx_model.onnx', 'xlmronnx_model_quantized.onnx')
quantized_session = onnxruntime.InferenceSession('xlmronnx_model_quantized.onnx', providers=['CPUExecutionProvider'])
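
# sanity check (optional): if the bug reproduces, the quantized graph may report
# fixed dims (e.g. the export-time 6x35 shape) instead of the symbolic names above
for inp in quantized_session.get_inputs():
    print(inp.name, inp.shape)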

# print output of test1 for quantized model
print(quantized_session.run(None, input_dict_np1)[0])

# print output of test2 for quantized model
print(quantized_session.run(None, input_dict_np2)[0])

# print output of test3 for quantized model
print(quantized_session.run(None, input_dict_np3)[0])
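
If dynamic quantization is acceptable for this use case, a possible workaround is quantize_dynamic instead of quantize_qat: it mainly rewrites the weight initializers, so the symbolic input dims are typically left in place. A sketch (output file name is made up; not verified on this exact model):

from onnxruntime.quantization import quantize_dynamic, QuantType

# dynamic (weight-only) quantization of the same exported model
quantize_dynamic('xlmronnx_model.onnx',
                 'xlmronnx_model_quantized_dyn.onnx',
                 weight_type=QuantType.QInt8)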
stale[bot] commented 2 years ago

This issue has been automatically marked as stale due to inactivity and will be closed in 7 days if no further activity occurs. If further support is needed, please provide an update and/or more details.