pngmafia opened this issue (closed 2 years ago):
I tried using:
input_ids = {"input_ids": decoder_input_ids.cpu(),"past_key_values": None, "attention_mask": None, "token_type_ids": None,"position_ids": None,"head_mask":None, "inputs_embeds": None, "encoder_hidden_states": encoder_hidden_states.cpu()}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
if not isinstance(v, torch.Tensor):
continue
if v.dtype in [torch.long, torch.int64]:
input_ids[k] = v.type(torch.int32)
convert_to_onnx(
    model_pytorch=decoder_model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,  # we inform the ONNX export tool that the output shape will vary with the input shape
    output_names=["output"],
)
# the model may switch to train mode for some unknown reason, so we force eval mode
_ = decoder_model.eval()
The values are getting passed to the correct variables, but there's an issue with dynamic sizes.
Complete Error Stack:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In [27], line 50
47 input_names = list(input_ids.keys())
49 with torch.no_grad():
---> 50 torch.onnx.export(
51 decoder_model, # model to optimize
52 args=tuple(input_ids.values()), # tuple of multiple inputs
53 f="test-gpt2.onnx", # output path / file object
54 opset_version=13, # the ONNX version to use, >= 13 supports channel quantized model
55 do_constant_folding=True, # simplify model (replace constant expressions)
56 input_names=input_names, # input names
57 output_names=["output"], # output names
58 dynamic_axes=dynamic_axis, # declare dynamix axis for each input / output
59 training=TrainingMode.EVAL, # always put the model in evaluation mode
60 verbose=False,
61 )
62 proto = onnx.load("test-gpt2.onnx", load_external_data=False)
63 save_onnx(proto=proto, model_path="test-gpt2.onnx")
File /usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py:350, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
74 r"""
75 Exports a model into ONNX format. If ``model`` is not a
76 :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs
(...)
345 model to the file ``f`` even if this is raised.
346 """
348 from torch.onnx import utils
--> 350 return utils.export(
351 model,
352 args,
353 f,
354 export_params,
355 verbose,
356 training,
357 input_names,
358 output_names,
359 operator_export_type,
360 opset_version,
361 do_constant_folding,
362 dynamic_axes,
363 keep_initializers_as_inputs,
364 custom_opsets,
365 export_modules_as_functions,
366 )
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:163, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
145 def export(
146 model,
147 args,
(...)
160 export_modules_as_functions=False,
161 ):
--> 163 _export(
164 model,
165 args,
166 f,
167 export_params,
168 verbose,
169 training,
170 input_names,
171 output_names,
172 operator_export_type=operator_export_type,
173 opset_version=opset_version,
174 do_constant_folding=do_constant_folding,
175 dynamic_axes=dynamic_axes,
176 keep_initializers_as_inputs=keep_initializers_as_inputs,
177 custom_opsets=custom_opsets,
178 export_modules_as_functions=export_modules_as_functions,
179 )
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:1074, in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size, custom_opsets, add_node_names, onnx_shape_inference, export_modules_as_functions)
1071 dynamic_axes = {}
1072 _validate_dynamic_axes(dynamic_axes, model, input_names, output_names)
-> 1074 graph, params_dict, torch_out = _model_to_graph(
1075 model,
1076 args,
1077 verbose,
1078 input_names,
1079 output_names,
1080 operator_export_type,
1081 val_do_constant_folding,
1082 fixed_batch_size=fixed_batch_size,
1083 training=training,
1084 dynamic_axes=dynamic_axes,
1085 )
1087 # TODO: Don't allocate a in-memory string for the protobuf
1088 defer_weight_export = (
1089 export_type is not torch.onnx.ExportTypes.PROTOBUF_FILE
1090 )
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:731, in _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size, training, dynamic_axes)
728 params_dict = _get_named_param_dict(graph, params)
730 try:
--> 731 graph = _optimize_graph(
732 graph,
733 operator_export_type,
734 _disable_torch_constant_prop=_disable_torch_constant_prop,
735 fixed_batch_size=fixed_batch_size,
736 params_dict=params_dict,
737 dynamic_axes=dynamic_axes,
738 input_names=input_names,
739 module=module,
740 )
741 except Exception as e:
742 torch.onnx.log("Torch IR graph at exception: ", graph)
File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:306, in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict, dynamic_axes, input_names, module)
304 input_names = [] if input_names is None else input_names
305 dynamic_axes = {} if dynamic_axes is None else dynamic_axes
--> 306 _C._jit_pass_onnx_set_dynamic_input_shape(graph, dynamic_axes, input_names)
307 _C._jit_pass_onnx_lint(graph)
308 graph = _C._jit_pass_onnx(graph, operator_export_type)
RuntimeError: Dynamic shape axis should be no more than the shape dimension for sequence
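This RuntimeError means that, for at least one input listed in dynamic_axes, the declared axis (axis 1, "sequence") is not smaller than that tensor's number of dimensions. With the dict above, the likely culprits are the None placeholders (past_key_values, attention_mask, and so on): convert_to_onnx builds input names and dynamic axes from the dict keys, so a dynamic sequence axis ends up declared for entries that have no shape at all. A minimal pre-export sanity check, sketched on the assumption that the same input_ids dict is reused:

input_ids = {k: v for k, v in input_ids.items() if isinstance(v, torch.Tensor)}  # drop non-tensor placeholders
for k, v in input_ids.items():
    # axis 1 ("sequence") must exist on every tensor it is declared for
    assert v.dim() >= 2, f"{k} has shape {tuple(v.shape)}; cannot mark axis 1 as dynamic"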
This script worked for exporting the model to ONNX, but it's not working for optimizing the ONNX graph:
input_ids = {"input_ids": decoder_input_ids.cpu(),"encoder_hidden_states": encoder_hidden_states.cpu()}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
if not isinstance(v, torch.Tensor):
continue
if v.dtype in [torch.long, torch.int64]:
input_ids[k] = v.type(torch.int32)
if hasattr(decoder_model, "config") and hasattr(decoder_model.config, "use_cache"):
    use_cache = getattr(decoder_model.config, "use_cache")
    setattr(decoder_model.config, "use_cache", False)
dynamic_axis = dict()
for k in input_ids.keys():
    if True:  # inlined from var_output_seq=True
        # the seq axis name is fixed so it matches the output seq axis name (for output shape prediction)
        dynamic_axis[k] = {0: "batch_size", 1: "sequence"}
    else:
        # if there is no specific requirement, each axis name is unique; fixes some issue on the T5 model
        dynamic_axis[k] = {0: "batch_size", 1: f"sequence-{k}"}
for output_name in ["output"]:
    dynamic_axis[output_name] = {0: "batch_size"}
    if True:  # inlined from var_output_seq=True
        dynamic_axis[output_name][1] = "sequence"
print(f"DYNAMIC AXIS: {dynamic_axis}")
# get input names in the same order as in the model forward
model_args = decoder_model.forward.__code__.co_varnames
input_names = []
for arg_name in model_args:
    if arg_name in input_ids.keys():
        input_names.append(arg_name)
# the sentence-transformers model forward takes only *args and **kwargs
if len(input_names) == 0:
    input_names = list(input_ids.keys())
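# Note: forward.__code__.co_varnames lists the parameters first but also the
# function's local variables; the membership test above is what keeps stray
# names out. A more direct alternative (sketch) is inspect.signature, which
# returns only the declared parameters, in order:
#   import inspect
#   model_args = list(inspect.signature(decoder_model.forward).parameters)
#   input_names = [name for name in model_args if name in input_ids]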
with torch.no_grad():
    torch.onnx.export(
        decoder_model,  # model to export
        # args=tuple(input_ids.values()),  # tuple of multiple inputs
        args=input_ids,
        f="test-gpt2.onnx",  # output path / file object
        opset_version=13,  # the ONNX opset to use; >= 13 supports per-channel quantized models
        do_constant_folding=True,  # simplify the model (replace constant expressions)
        input_names=input_names,  # input names
        output_names=["output"],  # output names
        dynamic_axes=dynamic_axis,  # declare dynamic axes for each input / output
        training=TrainingMode.EVAL,  # always put the model in evaluation mode
        verbose=False,
    )
proto = onnx.load("test-gpt2.onnx", load_external_data=False)
save_onnx(proto=proto, model_path="test-gpt2.onnx")
if False:  # inlined from quantization=False
    TensorQuantizer.use_fb_fake_quant = False
if hasattr(decoder_model, "config") and hasattr(decoder_model.config, "use_cache"):
    setattr(decoder_model.config, "use_cache", use_cache)
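Before moving on to optimization, it is worth checking that the sequence axis of the exported model really is dynamic. A minimal sketch, assuming onnxruntime is installed and a hidden size of 768 (GPT-2 base; adjust to your model):

import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("test-gpt2.onnx", providers=["CPUExecutionProvider"])
hidden_size = 768  # assumed hidden size; use your model's value
for seq_len in (8, 16):  # two different lengths must both run if axis 1 is dynamic
    feed = {
        "input_ids": np.ones((1, seq_len), dtype=np.int32),
        "encoder_hidden_states": np.zeros((1, seq_len, hidden_size), dtype=np.float32),
    }
    print(seq_len, session.run(["output"], feed)[0].shape)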
But the optimization step isn't working:
logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
num_attention_heads, hidden_size = get_model_size(path=DECODER_MODEL)
optimize_onnx(
    onnx_path="test-gpt2.onnx",
    onnx_optim_model_path="test-gpt2-opt.onnx",
    fp16=True,
    use_cuda=True,
    num_attention_heads=num_attention_heads,
    hidden_size=hidden_size,
    architecture="gpt2",
)
INFO:fusion_utils:Removed 26 Cast nodes with output type same as input
INFO:fusion_base:Fused LayerNormalization count: 19
INFO:fusion_base:Fused FastGelu count: 6
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In [61], line 4
2 logging.getLogger().setLevel(logging.INFO)
3 num_attention_heads, hidden_size = get_model_size(path=DECODER_MODEL)
----> 4 optimize_onnx(
5 onnx_path="test-gpt2.onnx",
6 onnx_optim_model_path="test-gpt2-opt.onnx",
7 fp16=True,
8 use_cuda=True,
9 num_attention_heads=num_attention_heads,
10 hidden_size=hidden_size,
11 architecture="gpt2",
12 )
File /usr/local/lib/python3.8/dist-packages/transformer_deploy/backends/ort_utils.py:117, in optimize_onnx(onnx_path, onnx_optim_model_path, fp16, use_cuda, num_attention_heads, hidden_size, architecture)
115 architecture = "bert"
116 opt_level = 1 if architecture == "bert" else 0
--> 117 optimized_model: BertOnnxModel = optimizer.optimize_model(
118 input=onnx_path,
119 model_type=architecture,
120 use_gpu=use_cuda,
121 opt_level=opt_level,
122 num_heads=num_attention_heads, # automatic detection with 0 may not work with opset 13 or distilbert models
123 hidden_size=hidden_size, # automatic detection with 0
124 optimization_options=optimization_options,
125 )
126 if fp16:
127 # use_symbolic_shape_infer set to false because doesn't work after ONNX package v1.10.2
128 optimized_model.convert_float_to_float16(use_symbolic_shape_infer=False) # FP32 -> FP16
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/optimizer.py:253, in optimize_model(input, model_type, num_heads, hidden_size, optimization_options, opt_level, use_gpu, only_onnxruntime)
251 optimizer = optimizer_class(model, num_heads, hidden_size)
252 else:
--> 253 optimizer = optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)
255 # Remove the temporary model.
256 if temp_model_path:
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/optimizer.py:153, in optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)
149 optimization_options = FusionOptions(model_type)
151 optimizer = optimizer_class(model, num_heads, hidden_size)
--> 153 optimizer.optimize(optimization_options)
155 optimizer.topological_sort()
157 optimizer.model.producer_name = "onnxruntime.transformers"
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../onnx_model_bert.py:360, in BertOnnxModel.optimize(self, options, add_dynamic_axes)
358 if options is not None:
359 self.attention_mask.set_mask_format(options.attention_mask_format)
--> 360 self.fuse_attention()
362 self.fuse_shape()
364 if (options is None) or options.enable_embed_layer_norm:
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../onnx_model_gpt2.py:23, in Gpt2OnnxModel.fuse_attention(self)
21 if len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1:
22 fusion = FusionGptAttentionNoPast(self, self.num_heads)
---> 23 fusion.apply()
24 else:
25 fusion = FusionGptAttention(self, self.num_heads)
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../fusion_base.py:46, in Fusion.apply(self)
44 raise Exception("Can not find node in any graphs")
45 self.this_graph_name = graph.name
---> 46 self.fuse(node, input_name_to_nodes, output_name_to_node)
48 op_list = [node.op_type for node in self.nodes_to_add]
49 count = max(self.fused_count, op_list.count(self.fused_op_type))
File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../fusion_gpt_attention_no_past.py:104, in FusionGptAttentionNoPast.fuse(self, normalize_node, input_name_to_nodes, output_name_to_node)
102 layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
103 if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
--> 104 if layernorm_before_attention.op_type != "Add":
105 logger.debug(f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}")
106 return
AttributeError: 'NoneType' object has no attribute 'op_type'
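Two observations from this traceback. First, Gpt2OnnxModel.fuse_attention selects FusionGptAttentionNoPast because the exported graph has a single output (the len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1 test visible on line 21). Second, fusion_gpt_attention_no_past.py has a latent bug: when get_parent returns None, the condition on line 103 short-circuits into the inner branch, which then reads .op_type from None. Based only on the lines visible above, a defensive version of that check would look like this sketch:

layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
    if layernorm_before_attention is None or layernorm_before_attention.op_type != "Add":
        logger.debug("failed to get layernorm before gemm")
        return

The crash is only a symptom, though: the underlying problem is that the fusion pattern matcher does not recognize this graph, because a decoder exported with encoder_hidden_states does not have the attention block layout the GPT-2 fuser expects.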
@pommedeterresautee, can you please help?
@pngmafia Hello, can you please post the full script that you are trying to test?
For the ONNX Runtime path I plan to send encoder_hidden_states as part of the inputs as well, but it seems the inputs are not carried into the forward pass as a dictionary: encoder_hidden_states is being bound to past_key_values, which should have been None (by default), hence the issue.
Complete error stack: see the AttributeError trace above.
Am I missing something? Please help @pommedeterresautee
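If the binding hypothesis above is right (torch.onnx.export matches the exported inputs to forward's parameters by position, so with a signature like forward(input_ids, past_key_values, attention_mask, ...) the second tensor lands in past_key_values), one workaround is to export a thin wrapper whose forward takes exactly the two tensors and passes them on by keyword. A minimal sketch, assuming a transformers-style decoder whose output object exposes a logits attribute:

import torch

class DecoderWrapper(torch.nn.Module):
    """Pin the argument binding for export: inputs are forwarded by keyword,
    so encoder_hidden_states can never be mistaken for past_key_values."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, encoder_hidden_states):
        out = self.model(input_ids=input_ids, encoder_hidden_states=encoder_hidden_states)
        return out.logits  # assumes a transformers-style output object

wrapped = DecoderWrapper(decoder_model).eval()
# export `wrapped` instead of decoder_model, with args=(decoder_input_ids, encoder_hidden_states)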