ELS-RD / transformer-deploy

Efficient, scalable and enterprise-grade CPU/GPU inference server for 🤗 Hugging Face transformer models 🚀
https://els-rd.github.io/transformer-deploy/
Apache License 2.0

encoder_hidden_states in the onnx inputs #139

Closed: pngmafia closed this issue 2 years ago

pngmafia commented 2 years ago

For ONNX Runtime, I plan to send encoder_hidden_states as part of the inputs as well:

input_ids = {"input_ids": decoder_input_ids, "encoder_hidden_states": encoder_hidden_states}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
    if not isinstance(v, torch.Tensor):
        continue
    if v.dtype in [torch.long, torch.int64]:
        input_ids[k] = v.type(torch.int32)

convert_to_onnx(
    model_pytorch=decoder_model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,
    output_names=["output"],  # we inform ONNX export tool that the output shape will vary with the input shape
)
# model may switch to train mode for some unknown reasons, we force the eval mode.
_ = decoder_model.eval()

But it seems the inputs are not carried into the forward pass as a dictionary: the tensors are bound positionally, so encoder_hidden_states lands on past_key_values (which should have stayed None by default), hence the issue:

File /usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py:801, in GPT2Model.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
    799     past_key_values = tuple([None] * len(self.h))
    800 else:
--> 801     past_length = past_key_values[0][0].size(-2)
    802 if position_ids is None:
    803     position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)

Complete error stack:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
Cell In [16], line 9
      6     if v.dtype in [torch.long, torch.int64]:
      7         input_ids[k] = v.type(torch.int32)
----> 9 convert_to_onnx(
     10     model_pytorch=decoder_model,
     11     output_path="test-gpt2.onnx",
     12     inputs_pytorch=dict(input_ids),
     13     quantization=False,
     14     var_output_seq=True,
     15     output_names=["output"],  # we inform ONNX export tool that the output shape will vary with the input shape
     16 )
     17 # model may switch to train mode for some unknown reasons, we force the eval mode.
     18 _ = decoder_model.eval()

File /usr/local/lib/python3.8/dist-packages/transformer_deploy/backends/pytorch_utils.py:158, in convert_to_onnx(model_pytorch, output_path, inputs_pytorch, quantization, var_output_seq, output_names)
    156     input_names = list(inputs_pytorch.keys())
    157 with torch.no_grad():
--> 158     torch.onnx.export(
    159         model_pytorch,  # model to optimize
    160         args=tuple(inputs_pytorch.values()),  # tuple of multiple inputs
    161         f=output_path,  # output path / file object
    162         opset_version=13,  # the ONNX version to use, >= 13 supports channel quantized model
    163         do_constant_folding=True,  # simplify model (replace constant expressions)
    164         input_names=input_names,  # input names
    165         output_names=output_names,  # output names
    166         dynamic_axes=dynamic_axis,  # declare dynamix axis for each input / output
    167         training=TrainingMode.EVAL,  # always put the model in evaluation mode
    168         verbose=False,
    169     )
    170 proto = onnx.load(output_path, load_external_data=False)
    171 save_onnx(proto=proto, model_path=output_path)

File /usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py:350, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
     74 r"""
     75 Exports a model into ONNX format. If ``model`` is not a
     76 :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs
   (...)
    345     model to the file ``f`` even if this is raised.
    346 """
    348 from torch.onnx import utils
--> 350 return utils.export(
    351     model,
    352     args,
    353     f,
    354     export_params,
    355     verbose,
    356     training,
    357     input_names,
    358     output_names,
    359     operator_export_type,
    360     opset_version,
    361     do_constant_folding,
    362     dynamic_axes,
    363     keep_initializers_as_inputs,
    364     custom_opsets,
    365     export_modules_as_functions,
    366 )

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:163, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
    145 def export(
    146     model,
    147     args,
   (...)
    160     export_modules_as_functions=False,
    161 ):
--> 163     _export(
    164         model,
    165         args,
    166         f,
    167         export_params,
    168         verbose,
    169         training,
    170         input_names,
    171         output_names,
    172         operator_export_type=operator_export_type,
    173         opset_version=opset_version,
    174         do_constant_folding=do_constant_folding,
    175         dynamic_axes=dynamic_axes,
    176         keep_initializers_as_inputs=keep_initializers_as_inputs,
    177         custom_opsets=custom_opsets,
    178         export_modules_as_functions=export_modules_as_functions,
    179     )

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:1074, in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size, custom_opsets, add_node_names, onnx_shape_inference, export_modules_as_functions)
   1071     dynamic_axes = {}
   1072 _validate_dynamic_axes(dynamic_axes, model, input_names, output_names)
-> 1074 graph, params_dict, torch_out = _model_to_graph(
   1075     model,
   1076     args,
   1077     verbose,
   1078     input_names,
   1079     output_names,
   1080     operator_export_type,
   1081     val_do_constant_folding,
   1082     fixed_batch_size=fixed_batch_size,
   1083     training=training,
   1084     dynamic_axes=dynamic_axes,
   1085 )
   1087 # TODO: Don't allocate a in-memory string for the protobuf
   1088 defer_weight_export = (
   1089     export_type is not torch.onnx.ExportTypes.PROTOBUF_FILE
   1090 )

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:727, in _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size, training, dynamic_axes)
    724     args = (args,)
    726 model = _pre_trace_quant_model(model, args)
--> 727 graph, params, torch_out, module = _create_jit_graph(model, args)
    728 params_dict = _get_named_param_dict(graph, params)
    730 try:

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:602, in _create_jit_graph(model, args)
    600     return graph, params, torch_out, None
    601 else:
--> 602     graph, torch_out = _trace_and_get_graph_from_model(model, args)
    603     _C._jit_pass_onnx_lint(graph)
    604     state_dict = torch.jit._unique_state_dict(model)

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:517, in _trace_and_get_graph_from_model(model, args)
    512 def _trace_and_get_graph_from_model(model, args):
    513     # A basic sanity check: make sure the state_dict keys are the same
    514     # before and after running the model.  Fail fast!
    515     orig_state_dict_keys = torch.jit._unique_state_dict(model).keys()
--> 517     trace_graph, torch_out, inputs_states = torch.jit._get_trace_graph(
    518         model, args, strict=False, _force_outplace=False, _return_inputs_states=True
    519     )
    520     warn_on_static_input_change(inputs_states)
    522     if orig_state_dict_keys != torch.jit._unique_state_dict(model).keys():

File /usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py:1175, in _get_trace_graph(f, args, kwargs, strict, _force_outplace, return_inputs, _return_inputs_states)
   1173 if not isinstance(args, tuple):
   1174     args = (args,)
-> 1175 outs = ONNXTracedModule(f, strict, _force_outplace, return_inputs, _return_inputs_states)(*args, **kwargs)
   1176 return outs

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py:127, in ONNXTracedModule.forward(self, *args)
    124     else:
    125         return tuple(out_vars)
--> 127 graph, out = torch._C._create_graph_by_tracing(
    128     wrapper,
    129     in_vars + module_state,
    130     _create_interpreter_name_lookup_fn(),
    131     self.strict,
    132     self._force_outplace,
    133 )
    135 if self._return_inputs:
    136     return graph, outs[0], ret_inputs[0]

File /usr/local/lib/python3.8/dist-packages/torch/jit/_trace.py:118, in ONNXTracedModule.forward.<locals>.wrapper(*args)
    116 if self._return_inputs_states:
    117     inputs_states.append(_unflatten(in_args, in_desc))
--> 118 outs.append(self.inner(*trace_inputs))
    119 if self._return_inputs_states:
    120     inputs_states[0] = (inputs_states[0], trace_inputs)

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1118, in Module._slow_forward(self, *input, **kwargs)
   1116         recording_scopes = False
   1117 try:
-> 1118     result = self.forward(*input, **kwargs)
   1119 finally:
   1120     if recording_scopes:

File /usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py:1058, in GPT2LMHeadModel.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict)
   1050 r"""
   1051 labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
   1052     Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
   1053     `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
   1054     are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
   1055 """
   1056 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1058 transformer_outputs = self.transformer(
   1059     input_ids,
   1060     past_key_values=past_key_values,
   1061     attention_mask=attention_mask,
   1062     token_type_ids=token_type_ids,
   1063     position_ids=position_ids,
   1064     head_mask=head_mask,
   1065     inputs_embeds=inputs_embeds,
   1066     encoder_hidden_states=encoder_hidden_states,
   1067     encoder_attention_mask=encoder_attention_mask,
   1068     use_cache=use_cache,
   1069     output_attentions=output_attentions,
   1070     output_hidden_states=output_hidden_states,
   1071     return_dict=return_dict,
   1072 )
   1073 hidden_states = transformer_outputs[0]
   1075 # Set device for model parallelism

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.8/dist-packages/torch/nn/modules/module.py:1118, in Module._slow_forward(self, *input, **kwargs)
   1116         recording_scopes = False
   1117 try:
-> 1118     result = self.forward(*input, **kwargs)
   1119 finally:
   1120     if recording_scopes:

File /usr/local/lib/python3.8/dist-packages/transformers/models/gpt2/modeling_gpt2.py:801, in GPT2Model.forward(self, input_ids, past_key_values, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict)
    799     past_key_values = tuple([None] * len(self.h))
    800 else:
--> 801     past_length = past_key_values[0][0].size(-2)
    802 if position_ids is None:
    803     position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got -2)

Am I missing something? Please help @pommedeterresautee
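
One way around the positional binding (a hedged sketch, not something transformer-deploy ships: torch.onnx.export traces the model with positional arguments, so a thin wrapper can pin each tensor to the intended keyword argument). DecoderWrapper is a hypothetical helper, and it assumes decoder_model is a GPT2LMHeadModel built with add_cross_attention=True:

import torch

class DecoderWrapper(torch.nn.Module):
    """Hypothetical wrapper: forces keyword binding during ONNX tracing."""

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, input_ids, encoder_hidden_states):
        # explicit keywords, so encoder_hidden_states can no longer land on past_key_values
        return self.model(
            input_ids=input_ids,
            encoder_hidden_states=encoder_hidden_states,
        ).logits

# convert_to_onnx(model_pytorch=DecoderWrapper(decoder_model), ...) then traces safely.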

pngmafia commented 2 years ago

Tried using:

input_ids = {"input_ids": decoder_input_ids.cpu(),"past_key_values": None, "attention_mask": None, "token_type_ids": None,"position_ids": None,"head_mask":None, "inputs_embeds": None, "encoder_hidden_states": encoder_hidden_states.cpu()}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
    if not isinstance(v, torch.Tensor):
        continue
    if v.dtype in [torch.long, torch.int64]:
        input_ids[k] = v.type(torch.int32)

convert_to_onnx(
    model_pytorch=decoder_model,
    output_path="test-gpt2.onnx",
    inputs_pytorch=dict(input_ids),
    quantization=False,
    var_output_seq=True,
    output_names=["output"],  # we inform ONNX export tool that the output shape will vary with the input shape
)
# model may switch to train mode for some unknown reasons, we force the eval mode.
_ = decoder_model.eval()

The values are now getting passed to the correct parameters, but there's an issue with the dynamic sizes.

Complete Error Stack:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In [27], line 50
     47     input_names = list(input_ids.keys())
     49 with torch.no_grad():
---> 50     torch.onnx.export(
     51         decoder_model,  # model to optimize
     52         args=tuple(input_ids.values()),  # tuple of multiple inputs
     53         f="test-gpt2.onnx",  # output path / file object
     54         opset_version=13,  # the ONNX version to use, >= 13 supports channel quantized model
     55         do_constant_folding=True,  # simplify model (replace constant expressions)
     56         input_names=input_names,  # input names
     57         output_names=["output"],  # output names
     58         dynamic_axes=dynamic_axis,  # declare dynamix axis for each input / output
     59         training=TrainingMode.EVAL,  # always put the model in evaluation mode
     60         verbose=False,
     61     )
     62 proto = onnx.load("test-gpt2.onnx", load_external_data=False)
     63 save_onnx(proto=proto, model_path="test-gpt2.onnx")

File /usr/local/lib/python3.8/dist-packages/torch/onnx/__init__.py:350, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
     74 r"""
     75 Exports a model into ONNX format. If ``model`` is not a
     76 :class:`torch.jit.ScriptModule` nor a :class:`torch.jit.ScriptFunction`, this runs
   (...)
    345     model to the file ``f`` even if this is raised.
    346 """
    348 from torch.onnx import utils
--> 350 return utils.export(
    351     model,
    352     args,
    353     f,
    354     export_params,
    355     verbose,
    356     training,
    357     input_names,
    358     output_names,
    359     operator_export_type,
    360     opset_version,
    361     do_constant_folding,
    362     dynamic_axes,
    363     keep_initializers_as_inputs,
    364     custom_opsets,
    365     export_modules_as_functions,
    366 )

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:163, in export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, custom_opsets, export_modules_as_functions)
    145 def export(
    146     model,
    147     args,
   (...)
    160     export_modules_as_functions=False,
    161 ):
--> 163     _export(
    164         model,
    165         args,
    166         f,
    167         export_params,
    168         verbose,
    169         training,
    170         input_names,
    171         output_names,
    172         operator_export_type=operator_export_type,
    173         opset_version=opset_version,
    174         do_constant_folding=do_constant_folding,
    175         dynamic_axes=dynamic_axes,
    176         keep_initializers_as_inputs=keep_initializers_as_inputs,
    177         custom_opsets=custom_opsets,
    178         export_modules_as_functions=export_modules_as_functions,
    179     )

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:1074, in _export(model, args, f, export_params, verbose, training, input_names, output_names, operator_export_type, export_type, opset_version, do_constant_folding, dynamic_axes, keep_initializers_as_inputs, fixed_batch_size, custom_opsets, add_node_names, onnx_shape_inference, export_modules_as_functions)
   1071     dynamic_axes = {}
   1072 _validate_dynamic_axes(dynamic_axes, model, input_names, output_names)
-> 1074 graph, params_dict, torch_out = _model_to_graph(
   1075     model,
   1076     args,
   1077     verbose,
   1078     input_names,
   1079     output_names,
   1080     operator_export_type,
   1081     val_do_constant_folding,
   1082     fixed_batch_size=fixed_batch_size,
   1083     training=training,
   1084     dynamic_axes=dynamic_axes,
   1085 )
   1087 # TODO: Don't allocate a in-memory string for the protobuf
   1088 defer_weight_export = (
   1089     export_type is not torch.onnx.ExportTypes.PROTOBUF_FILE
   1090 )

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:731, in _model_to_graph(model, args, verbose, input_names, output_names, operator_export_type, do_constant_folding, _disable_torch_constant_prop, fixed_batch_size, training, dynamic_axes)
    728 params_dict = _get_named_param_dict(graph, params)
    730 try:
--> 731     graph = _optimize_graph(
    732         graph,
    733         operator_export_type,
    734         _disable_torch_constant_prop=_disable_torch_constant_prop,
    735         fixed_batch_size=fixed_batch_size,
    736         params_dict=params_dict,
    737         dynamic_axes=dynamic_axes,
    738         input_names=input_names,
    739         module=module,
    740     )
    741 except Exception as e:
    742     torch.onnx.log("Torch IR graph at exception: ", graph)

File /usr/local/lib/python3.8/dist-packages/torch/onnx/utils.py:306, in _optimize_graph(graph, operator_export_type, _disable_torch_constant_prop, fixed_batch_size, params_dict, dynamic_axes, input_names, module)
    304     input_names = [] if input_names is None else input_names
    305     dynamic_axes = {} if dynamic_axes is None else dynamic_axes
--> 306     _C._jit_pass_onnx_set_dynamic_input_shape(graph, dynamic_axes, input_names)
    307 _C._jit_pass_onnx_lint(graph)
    308 graph = _C._jit_pass_onnx(graph, operator_export_type)

RuntimeError: Dynamic shape axis should be no more than the shape dimension for sequence
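
The None placeholders are a likely culprit here: torch.onnx.export drops non-tensor arguments when flattening the inputs, so input_names and dynamic_axes can end up pointing at a tensor that lacks the declared dimension. A hedged sketch of a safer construction (it reuses the input_ids dict from the snippet above):

import torch

# keep only real tensors, and only declare axes each tensor actually has
inputs = {k: v for k, v in input_ids.items() if isinstance(v, torch.Tensor)}
dynamic_axis = {}
for name, tensor in inputs.items():
    axes = {0: "batch_size"}
    if tensor.dim() > 1:
        axes[1] = "sequence"
    dynamic_axis[name] = axes
dynamic_axis["output"] = {0: "batch_size", 1: "sequence"}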

pngmafia commented 2 years ago

This script worked for exporting the model to ONNX, but it's not working for optimizing the ONNX graph:

input_ids = {"input_ids": decoder_input_ids.cpu(),"encoder_hidden_states": encoder_hidden_states.cpu()}
# some inference engines don't support int64 tensor as inputs, we convert all input tensors to int32 type
for k, v in input_ids.items():
    if not isinstance(v, torch.Tensor):
        continue
    if v.dtype in [torch.long, torch.int64]:
        input_ids[k] = v.type(torch.int32)

if hasattr(decoder_model, "config") and hasattr(decoder_model.config, "use_cache"):
    use_cache = getattr(decoder_model.config, "use_cache")
    setattr(decoder_model.config, "use_cache", False)

dynamic_axis = dict()
for k in input_ids.keys():
    # seq axis name is fixed to be matched with output seq axis name (for output shape prediction)
    dynamic_axis[k] = {0: "batch_size", 1: "sequence"}

for output_name in ["output"]:
    dynamic_axis[output_name] = {0: "batch_size", 1: "sequence"}

print(f"DYNAMIC AXIS: {dynamic_axis}")

# get input names in the same order as in the model forward
model_args = decoder_model.forward.__code__.co_varnames
input_names = []
for arg_name in model_args:
    if arg_name in input_ids.keys():
        input_names.append(arg_name)

# sentence transformer model forward uses *args and **kwargs
if len(input_names) == 0:
    input_names = list(input_ids.keys())

with torch.no_grad():
    torch.onnx.export(
        decoder_model,  # model to optimize
#         args=tuple(input_ids.values()),  # tuple of multiple inputs
        args=input_ids,  # pass the dict so each tensor binds to its keyword argument
        f="test-gpt2.onnx",  # output path / file object
        opset_version=13,  # the ONNX version to use, >= 13 supports channel quantized model
        do_constant_folding=True,  # simplify model (replace constant expressions)
        input_names=input_names,  # input names
        output_names=["output"],  # output names
        dynamic_axes=dynamic_axis,  # declare dynamic axes for each input / output
        training=TrainingMode.EVAL,  # always put the model in evaluation mode
        verbose=False,
    )
proto = onnx.load("test-gpt2.onnx", load_external_data=False)
save_onnx(proto=proto, model_path="test-gpt2.onnx")
if hasattr(decoder_model, "config") and hasattr(decoder_model.config, "use_cache"):
    setattr(decoder_model.config, "use_cache", use_cache)
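
Since the export itself succeeded, a quick smoke test with ONNX Runtime can confirm the graph accepts both tensors (a hedged sketch; the input/output names assume the export above, and decoder_input_ids / encoder_hidden_states are the same tensors as before):

import numpy as np
import onnxruntime

session = onnxruntime.InferenceSession("test-gpt2.onnx", providers=["CPUExecutionProvider"])
logits = session.run(
    output_names=["output"],
    input_feed={
        "input_ids": decoder_input_ids.cpu().numpy().astype(np.int32),
        "encoder_hidden_states": encoder_hidden_states.cpu().numpy(),
    },
)[0]
print(logits.shape)  # expected: (batch_size, sequence, vocab_size)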

But this isn't working:

import logging

from transformer_deploy.backends.ort_utils import optimize_onnx
from transformer_deploy.backends.pytorch_utils import get_model_size

logging.basicConfig()
logging.getLogger().setLevel(logging.INFO)
num_attention_heads, hidden_size = get_model_size(path=DECODER_MODEL)
optimize_onnx(
    onnx_path="test-gpt2.onnx",
    onnx_optim_model_path="test-gpt2-opt.onnx",
    fp16=True,
    use_cuda=True,
    num_attention_heads=num_attention_heads,
    hidden_size=hidden_size,
    architecture="gpt2",
)
INFO:fusion_utils:Removed 26 Cast nodes with output type same as input
INFO:fusion_base:Fused LayerNormalization count: 19
INFO:fusion_base:Fused FastGelu count: 6
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In [61], line 4
      2 logging.getLogger().setLevel(logging.INFO)
      3 num_attention_heads, hidden_size = get_model_size(path=DECODER_MODEL)
----> 4 optimize_onnx(
      5     onnx_path="test-gpt2.onnx",
      6     onnx_optim_model_path="test-gpt2-opt.onnx",
      7     fp16=True,
      8     use_cuda=True,
      9     num_attention_heads=num_attention_heads,
     10     hidden_size=hidden_size,
     11     architecture="gpt2",
     12 )

File /usr/local/lib/python3.8/dist-packages/transformer_deploy/backends/ort_utils.py:117, in optimize_onnx(onnx_path, onnx_optim_model_path, fp16, use_cuda, num_attention_heads, hidden_size, architecture)
    115     architecture = "bert"
    116 opt_level = 1 if architecture == "bert" else 0
--> 117 optimized_model: BertOnnxModel = optimizer.optimize_model(
    118     input=onnx_path,
    119     model_type=architecture,
    120     use_gpu=use_cuda,
    121     opt_level=opt_level,
    122     num_heads=num_attention_heads,  # automatic detection with 0 may not work with opset 13 or distilbert models
    123     hidden_size=hidden_size,  # automatic detection with 0
    124     optimization_options=optimization_options,
    125 )
    126 if fp16:
    127     # use_symbolic_shape_infer set to false because doesn't work after ONNX package v1.10.2
    128     optimized_model.convert_float_to_float16(use_symbolic_shape_infer=False)  # FP32 -> FP16

File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/optimizer.py:253, in optimize_model(input, model_type, num_heads, hidden_size, optimization_options, opt_level, use_gpu, only_onnxruntime)
    251     optimizer = optimizer_class(model, num_heads, hidden_size)
    252 else:
--> 253     optimizer = optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)
    255 # Remove the temporary model.
    256 if temp_model_path:

File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/optimizer.py:153, in optimize_by_fusion(model, model_type, num_heads, hidden_size, optimization_options)
    149     optimization_options = FusionOptions(model_type)
    151 optimizer = optimizer_class(model, num_heads, hidden_size)
--> 153 optimizer.optimize(optimization_options)
    155 optimizer.topological_sort()
    157 optimizer.model.producer_name = "onnxruntime.transformers"

File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../onnx_model_bert.py:360, in BertOnnxModel.optimize(self, options, add_dynamic_axes)
    358     if options is not None:
    359         self.attention_mask.set_mask_format(options.attention_mask_format)
--> 360     self.fuse_attention()
    362 self.fuse_shape()
    364 if (options is None) or options.enable_embed_layer_norm:

File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../onnx_model_gpt2.py:23, in Gpt2OnnxModel.fuse_attention(self)
     21 if len(self.model.graph.input) == 1 or len(self.model.graph.output) == 1:
     22     fusion = FusionGptAttentionNoPast(self, self.num_heads)
---> 23     fusion.apply()
     24 else:
     25     fusion = FusionGptAttention(self, self.num_heads)

File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../fusion_base.py:46, in Fusion.apply(self)
     44             raise Exception("Can not find node in any graphs")
     45         self.this_graph_name = graph.name
---> 46         self.fuse(node, input_name_to_nodes, output_name_to_node)
     48 op_list = [node.op_type for node in self.nodes_to_add]
     49 count = max(self.fused_count, op_list.count(self.fused_op_type))

File /usr/local/lib/python3.8/dist-packages/onnxruntime/transformers/models/gpt2/../../fusion_gpt_attention_no_past.py:104, in FusionGptAttentionNoPast.fuse(self, normalize_node, input_name_to_nodes, output_name_to_node)
    102 layernorm_before_attention = self.model.get_parent(reshape_before_gemm, 0, output_name_to_node)
    103 if layernorm_before_attention is None or layernorm_before_attention.op_type != "LayerNormalization":
--> 104     if layernorm_before_attention.op_type != "Add":
    105         logger.debug(f"failed to get layernorm before gemm. Got {layernorm_before_attention.op_type}")
    106         return

AttributeError: 'NoneType' object has no attribute 'op_type'

@pommedeterresautee can you please help?
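
If the stock GPT-2 attention fusion is the blocker (FusionGptAttentionNoPast does not know the cross-attention pattern that encoder_hidden_states introduces), one possible workaround is to skip attention fusion so the remaining passes still apply. This is a hedged sketch against the onnxruntime.transformers API, not validated on this model:

from onnxruntime.transformers import optimizer
from onnxruntime.transformers.fusion_options import FusionOptions

options = FusionOptions("gpt2")
options.enable_attention = False  # the fusion pass that raised the AttributeError above

optimized = optimizer.optimize_model(
    input="test-gpt2.onnx",
    model_type="gpt2",
    use_gpu=True,
    opt_level=0,
    num_heads=num_attention_heads,
    hidden_size=hidden_size,
    optimization_options=options,
)
optimized.convert_float_to_float16(use_symbolic_shape_infer=False)  # optional FP16 cast, mirroring optimize_onnx
optimized.save_model_to_file("test-gpt2-opt.onnx")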

ayoub-louati commented 2 years ago

@pngmafia Hello, can you please post the full script that you are trying to test?