Open thewh1teagle opened 2 months ago
Describe the bug: When running the optimized Whisper medium int8 model on macOS with the CoreML execution provider, session creation fails with the following error:
CreateSession(Msg("Could not find an implementation for DecoderMaskedMultiHeadAttention(1) node with name 'Attention_0'"))
To Reproduce: Enable the CoreML execution provider and load the medium.int8 GPU model from https://huggingface.co/thewh1teagle/whisper-olive/tree/main
Expected behavior: The model should load and run without error.
Olive config
{ "input_model": { "type": "CompositeModel", "model_component_names": [ "encoder_decoder_init", "decoder" ], "model_components": [ { "type": "PyTorchModel", "model_path": "openai/whisper-medium", "model_script": "code/user_script.py", "script_dir": "code", "model_loader": "get_encoder_decoder_init", "io_config": "get_encdec_io_config", "dummy_inputs_func": "encoder_decoder_init_dummy_inputs" }, { "type": "PyTorchModel", "model_path": "openai/whisper-medium", "model_script": "code/user_script.py", "script_dir": "code", "model_loader": "get_decoder", "io_config": "get_dec_io_config", "dummy_inputs_func": "decoder_dummy_inputs" } ], "model_attributes": { "vocab_size": 51865, "num_mel_bins": 80, "d_model": 1024, "encoder_layers": 24, "encoder_attention_heads": 16, "decoder_layers": 24, "decoder_attention_heads": 16, "decoder_ffn_dim": 4096, "encoder_ffn_dim": 4096, "dropout": 0.0, "attention_dropout": 0.0, "activation_dropout": 0.0, "activation_function": "gelu", "init_std": 0.02, "encoder_layerdrop": 0.0, "decoder_layerdrop": 0.0, "use_cache": true, "num_hidden_layers": 24, "scale_embedding": false, "max_source_positions": 1500, "max_target_positions": 448, "classifier_proj_size": 256, "use_weighted_layer_sum": false, "apply_spec_augment": false, "mask_time_prob": 0.05, "mask_time_length": 10, "mask_time_min_masks": 2, "mask_feature_prob": 0.0, "mask_feature_length": 10, "mask_feature_min_masks": 0, "median_filter_width": 7, "return_dict": true, "output_hidden_states": false, "output_attentions": false, "torchscript": false, "torch_dtype": "float32", "use_bfloat16": false, "tf_legacy_loss": false, "pruned_heads": {}, "tie_word_embeddings": true, "chunk_size_feed_forward": 0, "is_encoder_decoder": true, "is_decoder": false, "cross_attention_hidden_size": null, "add_cross_attention": false, "tie_encoder_decoder": false, "max_length": 448, "min_length": 0, "do_sample": false, "early_stopping": false, "num_beams": 1, "num_beam_groups": 1, "diversity_penalty": 0.0, 
"temperature": 1.0, "top_k": 50, "top_p": 1.0, "typical_p": 1.0, "repetition_penalty": 1.0, "length_penalty": 1.0, "no_repeat_ngram_size": 0, "encoder_no_repeat_ngram_size": 0, "bad_words_ids": null, "num_return_sequences": 1, "output_scores": false, "return_dict_in_generate": false, "forced_bos_token_id": null, "forced_eos_token_id": null, "remove_invalid_values": false, "exponential_decay_length_penalty": null, "begin_suppress_tokens": [ 220, 50257 ], "architectures": [ "WhisperForConditionalGeneration" ], "finetuning_task": null, "id2label": { "0": "LABEL_0", "1": "LABEL_1" }, "label2id": { "LABEL_0": 0, "LABEL_1": 1 }, "tokenizer_class": null, "prefix": null, "bos_token_id": 50257, "pad_token_id": 50257, "eos_token_id": 50257, "sep_token_id": null, "decoder_start_token_id": 50258, "task_specific_params": null, "problem_type": null, "_name_or_path": "openai/whisper-medium", "transformers_version": "4.42.4", "forced_decoder_ids": [ [ 1, 50259 ], [ 2, 50359 ], [ 3, 50363 ] ], "model_type": "whisper" } }, "systems": { "local_system": { "type": "LocalSystem", "accelerators": [ { "device": "gpu", "execution_providers": [ "CUDAExecutionProvider" ] } ] } }, "data_configs": [ { "name": "latency_data_config", "user_script": "code/user_script.py", "script_dir": "code", "load_dataset_config": { "type": "whisper_dataset", "data_dir": "data", "model_name": "openai/whisper-medium", "use_audio_decoder": true }, "dataloader_config": { "type": "no_auto_batch_dataloader" } } ], "evaluators": { "common_evaluator": { "metrics": [ { "name": "latency", "type": "latency", "sub_types": [ { "name": "avg", "priority": 1 } ], "data_config": "latency_data_config" } ] } }, "passes": { "conversion": { "type": "OnnxConversion", "target_opset": 17 }, "transformers_optimization": { "type": "OrtTransformersOptimization", "optimization_options": { "use_multi_head_attention": true }, "use_gpu": true }, "onnx_dynamic_quantization": { "type": "OnnxDynamicQuantization", "per_channel": false, 
"reduce_range": false, "op_types_to_quantize": [ "MatMul", "Gemm", "Gather" ], "MatMulConstBOnly": false }, "insert_beam_search": { "type": "InsertBeamSearch", "use_forced_decoder_ids": true, "use_logits_processor": false, "fp16": false }, "prepost": { "type": "AppendPrePostProcessingOps", "tool_command": "whisper", "tool_command_args": { "model_name": "openai/whisper-medium", "use_audio_decoder": true }, "target_opset": 17 } }, "log_severity_level": 0, "host": "local_system", "target": "local_system", "evaluator": "common_evaluator", "evaluate_input_model": false, "clean_cache": false, "cache_dir": "cache", "output_dir": "models", "output_name": "whisper_gpu_int8" }
Olive logs Add logs here.
Other information
Additional context: Related issue: https://github.com/microsoft/Olive/issues/1213. The model was optimized on an Ubuntu server with CUDA.
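The Whisper example has only been optimized and tested for the CPU and CUDA execution providers (EPs). It uses multiple ONNX Runtime custom operators (for example, the DecoderMaskedMultiHeadAttention node named in the error above) that have not been implemented on other EPs such as CoreML.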
Describe the bug When running optimized whisper medium int8 on macOS coreml I get the error
To Reproduce Enable CoreML and use medium.int8 gpu model from https://huggingface.co/thewh1teagle/whisper-olive/tree/main
Expected behavior It should run without error
Olive config
Olive logs Add logs here.
Other information
Additional context Related: https://github.com/microsoft/Olive/issues/1213 Optimized on ubuntu server with cuda