pngmafia opened this issue (closed 2 years ago):
I tried using:
input_ids = {"input_ids": decoder_input_ids.cpu(),"past_key_values": None, "attention_mask": None, "token_type_ids": None,"position_ids": None,"head_mask":None, "inputs_embeds": None, "encoder_hidden_states": encoder_hidden_states.cpu()}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
if not isinstance(v, torch.Tensor):
continue
if v.dtype in [torch.long, torch.int64]:
input_ids[k] = v.type(torch.int32)
convert_to_onnx(
    model_pytorch=decoder_model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,  # we inform the ONNX export tool that the output shape will vary with the input shape
    output_names=["output"],
)
# the model may switch to train mode for some unknown reason, so we force eval mode
_ = decoder_model.eval()
The values are getting passed to the correct variables, but there's an issue with dynamic sizes.
Complete Error Stack:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In [27], line 50
47 input_names = list(input_ids.keys())
49 with torch.no_grad():
---> 50 torch.onnx.export(
51 decoder_model, # model to optimize
52 args=tuple(input_ids.values()), # tuple of multiple inputs
53 f="test-gpt2.onnx", # output path / file object
54 opset_version=13, # the ONNX version to use, >= 13 supports channel quantized model
55 do_constant_folding=True, # simplify model (replace constant expressions)
56 input_names=input_names, # input names
57 output_names=["output"], # output names
58 dynamic_axes=dynamic_axis, # declare dynamix axis for each input / output
59 training=TrainingMode.EVAL, # always put the model in evaluation mode
60 verbose=False,
61 )
62 proto = onnx.load("test-gpt2.onnx", load_external_data=False)
63 save_onnx(proto=proto, model_path="test-gpt2.onnx")
File /usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py:350, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
74 r"""
75 Exports a model into ONNX format. If ``model`` is not a
76 :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs
(...)
345 model to the file ``f`` even if this is raised.
346 """
348 from torch.onnx import utils
--> 350 return utils.export(
351 model,
352 args,
353 f,
354 export_params,
355 verbose,
356 training,
357 input_names,
358 output_names,
359 operator_export_type,
360 opset_version,
361 do_constant_folding,
362 dynamic_axes,
363 keep_initializers_as_inputs,
364 custom_opsets,
365 export_modules_as_functions,
366 )
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:163, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
145 def export(
146 model,
147 args,
(...)
160 export_modules_as_functions=False,
161 ):
--> 163 _export(
164 model,
165 args,
166 f,
167 export_params,
168 verbose,
169 training,
170 input_names,
171 output_names,
172 operator_export_type=operator_export_type,
173 opset_version=opset_version,
174 do_constant_folding=do_constant_folding,
175 dynamic_axes=dynamic_axes,
176 keep_initializers_as_inputs=keep_initializers_as_inputs,
177 custom_opsets=custom_opsets,
178 export_modules_as_functions=export_modules_as_functions,
179 )
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:1074, in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size, custom_opsets, add_node_names, onnx_shape_inference, export_modules_as_functions)
1071 dynamic_axes = {}
1072 _validate_dynamic_axes(dynamic_axes, model, input_names, output_names)
-> 1074 graph, params_dict, torch_out = _model_to_graph(
1075 model,
1076 args,
1077 verbose,
1078 input_names,
1079 output_names,
1080 operator_export_type,
1081 val_do_constant_folding,
1082 fixed_batch_size=fixed_batch_size,
1083 training=training,
1084 dynamic_axes=dynamic_axes,
1085 )
1087 # TODO: Don't allocate a in-memory string for the protobuf
1088 defer_weight_export = (
1089 export_type is not torch.onnx.ExportTypes.PROTOBUF_FILE
1090 )
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:731, in _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size, training, dynamic_axes)
728 params_dict = _get_named_param_dict(graph, params)
730 try:
--> 731 graph = _optimize_graph(
732 graph,
733 operator_export_type,
734 _disable_torch_constant_prop=_disable_torch_constant_prop,
735 fixed_batch_size=fixed_batch_size,
736 params_dict=params_dict,
737 dynamic_axes=dynamic_axes,
738 input_names=input_names,
739 module=module,
740 )
741 except Exception as e:
742 torch.onnx.log("Torch IR graph at exception: ", graph)
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:306, in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict, dynamic_axes, input_names, module)
304 input_names = [] if input_names is None else input_names
305 dynamic_axes = {} if dynamic_axes is None else dynamic_axes
--> 306 _C._jit_pass_onnx_set_dynamic_input_shape(graph, dynamic_axes, input_names)
307 _C._jit_pass_onnx_lint(graph)
308 graph = _C._jit_pass_onnx(graph, operator_export_type)
RuntimeError: Dynamic shape axis should be no more than the shape dimension for sequence
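This RuntimeError means that, for at least one input listed in dynamic_axes, the declared axis (axis 1, "sequence") is not smaller than that tensor's number of dimensions. With the dict above, the likely culprits are the None placeholders (past_key_values, attention_mask, and so on): convert_to_onnx builds input names and dynamic axes from the dict keys, so a dynamic sequence axis ends up declared for entries that have no shape at all. A minimal pre-export sanity check, sketched on the assumption that the same input_ids dict is reused:

input_ids = {k: v for k, v in input_ids.items() if isinstance(v, torch.Tensor)}  # drop non-tensor placeholders
for k, v in input_ids.items():
    # axis 1 ("sequence") must exist on every tensor it is declared for
    assert v.dim() >= 2, f"{k} has shape {tuple(v.shape)}; cannot mark axis 1 as dynamic"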
This script worked for exporting the model to ONNX, but it's not working for optimizing the ONNX graph:
input_ids = {"input_ids": decoder_input_ids.cpu(),"encoder_hidden_states": encoder_hidden_states.cpu()}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
if not isinstance(v, torch.Tensor):
continue
if v.dtype in [torch.long, torch.int64]:
input_ids[k] = v.type(torch.int32)
if hasattr(decoder_model, "config") and hasattr(decoder_model.config, "use_cache"):
    use_cache = getattr(decoder_model.config, "use_cache")
    setattr(decoder_model.config, "use_cache", False)
dynamic_axis = dict()
for k in input_ids.keys():
    if True:  # inlined from var_output_seq=True
        # the seq axis name is fixed so it matches the output seq axis name (for output shape prediction)
        dynamic_axis[k] = {0: "batch_size", 1: "sequence"}
    else:
        # if there is no specific requirement, each axis name is unique; fixes some issue on the T5 model
        dynamic_axis[k] = {0: "batch_size", 1: f"sequence-{k}"}
for output_name in ["output"]:
    dynamic_axis[output_name] = {0: "batch_size"}
    if True:  # inlined from var_output_seq=True
        dynamic_axis[output_name][1] = "sequence"
print(f"DYNAMIC AXIS: {dynamic_axis}")
# get input names in the same order as in the model forward
model_args = decoder_model.forward.__code__.co_varnames
input_names = []
for arg_name in model_args:
    if arg_name in input_ids.keys():
        input_names.append(arg_name)
# the sentence-transformers model forward takes only *args and **kwargs
if len(input_names) == 0:
    input_names = list(input_ids.keys())
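# Note: forward.__code__.co_varnames lists the parameters first but also the
# function's local variables; the membership test above is what keeps stray
# names out. A more direct alternative (sketch) is inspect.signature, which
# returns only the declared parameters, in order:
#   import inspect
#   model_args = list(inspect.signature(decoder_model.forward).parameters)
#   input_names = [name for name in model_args if name in input_ids]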
with torch.no_grad():
    torch.onnx.export(
        decoder_model,  # model to export
        # args=tuple(input_ids.values()),  # tuple of multiple inputs
        args=input_ids,
        f="test-gpt2.onnx",  # output path / file object
        opset_version=13,  # the ONNX opset to use; >= 13 supports per-channel quantized models
        do_constant_folding=True,  # simplify the model (replace constant expressions)
        input_names=input_names,  # input names
        output_names=["output"],  # output names
        dynamic_axes=dynamic_axis,  # declare dynamic axes for each input / output
        training=TrainingMode.EVAL,  # always put the model in evaluation mode
        verbose=False,
    )
proto = onnx.load("test-gpt2.onnx", load_external_data=False)
save_onnx(proto=proto, model_path="test-gpt2.onnx")
if False:  # inlined from quantization=False
    TensorQuantizer.use_fb_fake_quant = False
if hasattr(decoder_model, "config") and hasattr(decoder_model.config, "use_cache"):
    setattr(decoder_model.config, "use_cache", use_cache)
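Before moving on to optimization, it is worth checking that the sequence axis of the exported model really is dynamic. A minimal sketch, assuming onnxruntime is installed and a hidden size of 768 (GPT-2 base; adjust to your model):

import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("test-gpt2.onnx", providers=["CPUExecutionProvider"])
hidden_size = 768  # assumed hidden size; use your model's value
for seq_len in (8, 16):  # two different lengths must both run if axis 1 is dynamic
    feed = {
        "input_ids": np.ones((1, seq_len), dtype=np.int32),
        "encoder_hidden_states": np.zeros((1, seq_len, hidden_size), dtype=np.float32),
    }
    print(seq_len, session.run(["output"], feed)[0].shape)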
But the optimization step isn't working:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
num_attention_heads, hidden_size = get_model_size(path=DECODER_MODEL)
optimize_onnx(
    onnx_path="test-gpt2.onnx",
    onnx_optim_model_path="test-gpt2-opt.onnx",
    fp16=True,
    use_cuda=True,
    num_attention_heads=num_attention_heads,
    hidden_size=hidden_size,
    architecture="gpt2",
)
INFO:fusion_utils:Removed 26 Cast nodes with output type same as input
INFO:fusion_base:Fused LayerNormalization count: 19
INFO:fusion_base:Fused FastGelu count: 6
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [61], line 4
2 logging.getLogger().setLevel(logging.INFO)
3 num_attention_heads, hidden_size = get_model_size(path=DECODER_MODEL)
----> 4 optimize_onnx(
5 onnx_path="test-gpt2.onnx",
6 onnx_optim_model_path="test-gpt2-opt.onnx",
7 fp16=True,
8 use_cuda=True,
9 num_attention_heads=num_attention_heads,
10 hidden_size=hidden_size,
11 architecture="gpt2",
12 )
File /usr/local/lib/python3.8/dist-packages/transformer_deploy/backends/ort_utils.py:117, in optimize_onnx(onnx_path, onnx_optim_model_path, fp16, use_cuda, num_attention_heads, hidden_size, architecture)
115 architecture = "bert"
116 opt_level = 1 if architecture == "bert" else 0
--> 117 optimized_model: BertOnnxModel = optimizer.optimize_model(
118 input=onnx_path,
119 model_type=architecture,
120 use_gpu=use_cuda,
121 opt_level=opt_level,
122 num_heads=num_attention_heads, # automatic detection with 0 may not work with opset 13 or distilbert models
123 hidden_size=hidden_size, # automatic detection with 0
124 optimization_options=optimization_options,
125 )
126 if fp16:
127 # use_symbolic_shape_infer set to false because doesn't work after ONNX package v1.10.2
128 optimized_model.convert_float_to_float16(use_symbolic_shape_infer=False) # FP32 -> FP16
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/optimizer.py:253, in optimize_model(input, model_type, num_heads, hidden_size, optimization_options, opt_level, use_gpu, only_onnxruntime)
251 optimizer = optimizer_class(model, num_heads, hidden_size)
252 else:
--> 253 optimizer = optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)
255 # Remove the temporary model.
256 if temp_model_path:
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/optimizer.py:153, in optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)
149 optimization_options = FusionOptions(model_type)
151 optimizer = optimizer_class(model, num_heads, hidden_size)
--> 153 optimizer.optimize(optimization_options)
155 optimizer.topological_sort()
157 optimizer.model.producer_name = "onnxruntime.transformers"
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../onnx_model_bert.py:360, in BertOnnxModel.optimize(self, options, add_dynamic_axes)
358 if options is not None:
359 self.attention_mask.set_mask_format(options.attention_mask_format)
--> 360 self.fuse_attention()
362 self.fuse_shape()
364 if (options is None) or options.enable_embed_layer_norm:
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../onnx_model_gpt2.py:23, in Gpt2OnnxModel.fuse_attention(self)
21 if len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1:
22 fusion = FusionGptAttentionNoPast(self, self.num_heads)
---> 23 fusion.apply()
24 else:
25 fusion = FusionGptAttention(self, self.num_heads)
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../fusion_base.py:46, in Fusion.apply(self)
44 raise Exception("Can not find node in any graphs")
45 self.this_graph_name = graph.name
---> 46 self.fuse(node, input_name_to_nodes, output_name_to_node)
48 op_list = [node.op_type for node in self.nodes_to_add]
49 count = max(self.fused_count, op_list.count(self.fused_op_type))
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../fusion_gpt_attention_no_past.py:104, in FusionGptAttentionNoPast.fuse(self, normalize_node, input_name_to_nodes, output_name_to_node)
102 layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
103 if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
--> 104 if layernorm_before_attention.op_type != "Add":
105 logger.debug(f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}")
106 return
AttributeError: 'NoneType' object has no attribute 'op_type'
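Two observations from this traceback. First, Gpt2OnnxModel.fuse_attention selects FusionGptAttentionNoPast because the exported graph has a single output (the len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1 test visible on line 21). Second, fusion_gpt_attention_no_past.py has a latent bug: when get_parent returns None, the condition on line 103 short-circuits into the inner branch, which then reads .op_type from None. Based only on the lines visible above, a defensive version of that check would look like this sketch:

layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
    if layernorm_before_attention is None or layernorm_before_attention.op_type != "Add":
        logger.debug("failed to get layernorm before gemm")
        return

The crash is only a symptom, though: the underlying problem is that the fusion pattern matcher does not recognize this graph, because a decoder exported with encoder_hidden_states does not have the attention block layout the GPT-2 fuser expects.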
@pommedeterresautee, can you please help?
@pngmafia Hello, can you please post the full script that you are trying to test?
For the ONNX Runtime path I plan to send encoder_hidden_states as part of the inputs as well, but it seems the inputs are not carried into the forward pass as a dictionary: encoder_hidden_states is being bound to past_key_values, which should have been None (by default), hence the issue.
Complete error stack: see the AttributeError trace above.
Am I missing something? Please help @pommedeterresautee
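If the binding hypothesis above is right (torch.onnx.export matches the exported inputs to forward's parameters by position, so with a signature like forward(input_ids, past_key_values, attention_mask, ...) the second tensor lands in past_key_values), one workaround is to export a thin wrapper whose forward takes exactly the two tensors and passes them on by keyword. A minimal sketch, assuming a transformers-style decoder whose output object exposes a logits attribute:

import torch

class DecoderWrapper(torch.nn.Module):
    """Pin the argument binding for export: inputs are forwarded by keyword,
    so encoder_hidden_states can never be mistaken for past_key_values."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, encoder_hidden_states):
        out = self.model(input_ids=input_ids, encoder_hidden_states=encoder_hidden_states)
        return out.logits  # assumes a transformers-style output object

wrapped = DecoderWrapper(decoder_model).eval()
# export `wrapped` instead of decoder_model, with args=(decoder_input_ids, encoder_hidden_states)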