microsoft / Olive

Olive: Simplify ML Model Finetuning, Conversion, Quantization, and Optimization for CPUs, GPUs and NPUs.
https://microsoft.github.io/Olive/
MIT License
1.58k stars 165 forks source link

Missing implementation error for CoreML #1299

Open thewh1teagle opened 2 months ago

thewh1teagle commented 2 months ago

Describe the bug When running the optimized Whisper medium int8 model on macOS with the CoreML execution provider, I get the error

CreateSession(Msg("Could not find an implementation for DecoderMaskedMultiHeadAttention(1) node with name 'Attention_0'"))

To Reproduce Enable CoreML and use the medium.int8 GPU model from https://huggingface.co/thewh1teagle/whisper-olive/tree/main

Expected behavior It should run without error

Olive config

{
    "input_model": {
        "type": "CompositeModel",
        "model_component_names": [
            "encoder_decoder_init",
            "decoder"
        ],
        "model_components": [
            {
                "type": "PyTorchModel",
                "model_path": "openai/whisper-medium",
                "model_script": "code/user_script.py",
                "script_dir": "code",
                "model_loader": "get_encoder_decoder_init",
                "io_config": "get_encdec_io_config",
                "dummy_inputs_func": "encoder_decoder_init_dummy_inputs"
            },
            {
                "type": "PyTorchModel",
                "model_path": "openai/whisper-medium",
                "model_script": "code/user_script.py",
                "script_dir": "code",
                "model_loader": "get_decoder",
                "io_config": "get_dec_io_config",
                "dummy_inputs_func": "decoder_dummy_inputs"
            }
        ],
        "model_attributes": {
            "vocab_size": 51865,
            "num_mel_bins": 80,
            "d_model": 1024,
            "encoder_layers": 24,
            "encoder_attention_heads": 16,
            "decoder_layers": 24,
            "decoder_attention_heads": 16,
            "decoder_ffn_dim": 4096,
            "encoder_ffn_dim": 4096,
            "dropout": 0.0,
            "attention_dropout": 0.0,
            "activation_dropout": 0.0,
            "activation_function": "gelu",
            "init_std": 0.02,
            "encoder_layerdrop": 0.0,
            "decoder_layerdrop": 0.0,
            "use_cache": true,
            "num_hidden_layers": 24,
            "scale_embedding": false,
            "max_source_positions": 1500,
            "max_target_positions": 448,
            "classifier_proj_size": 256,
            "use_weighted_layer_sum": false,
            "apply_spec_augment": false,
            "mask_time_prob": 0.05,
            "mask_time_length": 10,
            "mask_time_min_masks": 2,
            "mask_feature_prob": 0.0,
            "mask_feature_length": 10,
            "mask_feature_min_masks": 0,
            "median_filter_width": 7,
            "return_dict": true,
            "output_hidden_states": false,
            "output_attentions": false,
            "torchscript": false,
            "torch_dtype": "float32",
            "use_bfloat16": false,
            "tf_legacy_loss": false,
            "pruned_heads": {},
            "tie_word_embeddings": true,
            "chunk_size_feed_forward": 0,
            "is_encoder_decoder": true,
            "is_decoder": false,
            "cross_attention_hidden_size": null,
            "add_cross_attention": false,
            "tie_encoder_decoder": false,
            "max_length": 448,
            "min_length": 0,
            "do_sample": false,
            "early_stopping": false,
            "num_beams": 1,
            "num_beam_groups": 1,
            "diversity_penalty": 0.0,
            "temperature": 1.0,
            "top_k": 50,
            "top_p": 1.0,
            "typical_p": 1.0,
            "repetition_penalty": 1.0,
            "length_penalty": 1.0,
            "no_repeat_ngram_size": 0,
            "encoder_no_repeat_ngram_size": 0,
            "bad_words_ids": null,
            "num_return_sequences": 1,
            "output_scores": false,
            "return_dict_in_generate": false,
            "forced_bos_token_id": null,
            "forced_eos_token_id": null,
            "remove_invalid_values": false,
            "exponential_decay_length_penalty": null,
            "begin_suppress_tokens": [
                220,
                50257
            ],
            "architectures": [
                "WhisperForConditionalGeneration"
            ],
            "finetuning_task": null,
            "id2label": {
                "0": "LABEL_0",
                "1": "LABEL_1"
            },
            "label2id": {
                "LABEL_0": 0,
                "LABEL_1": 1
            },
            "tokenizer_class": null,
            "prefix": null,
            "bos_token_id": 50257,
            "pad_token_id": 50257,
            "eos_token_id": 50257,
            "sep_token_id": null,
            "decoder_start_token_id": 50258,
            "task_specific_params": null,
            "problem_type": null,
            "_name_or_path": "openai/whisper-medium",
            "transformers_version": "4.42.4",
            "forced_decoder_ids": [
                [
                    1,
                    50259
                ],
                [
                    2,
                    50359
                ],
                [
                    3,
                    50363
                ]
            ],
            "model_type": "whisper"
        }
    },
    "systems": {
        "local_system": {
            "type": "LocalSystem",
            "accelerators": [
                {
                    "device": "gpu",
                    "execution_providers": [
                        "CUDAExecutionProvider"
                    ]
                }
            ]
        }
    },
    "data_configs": [
        {
            "name": "latency_data_config",
            "user_script": "code/user_script.py",
            "script_dir": "code",
            "load_dataset_config": {
                "type": "whisper_dataset",
                "data_dir": "data",
                "model_name": "openai/whisper-medium",
                "use_audio_decoder": true
            },
            "dataloader_config": {
                "type": "no_auto_batch_dataloader"
            }
        }
    ],
    "evaluators": {
        "common_evaluator": {
            "metrics": [
                {
                    "name": "latency",
                    "type": "latency",
                    "sub_types": [
                        {
                            "name": "avg",
                            "priority": 1
                        }
                    ],
                    "data_config": "latency_data_config"
                }
            ]
        }
    },
    "passes": {
        "conversion": {
            "type": "OnnxConversion",
            "target_opset": 17
        },
        "transformers_optimization": {
            "type": "OrtTransformersOptimization",
            "optimization_options": {
                "use_multi_head_attention": true
            },
            "use_gpu": true
        },
        "onnx_dynamic_quantization": {
            "type": "OnnxDynamicQuantization",
            "per_channel": false,
            "reduce_range": false,
            "op_types_to_quantize": [
                "MatMul",
                "Gemm",
                "Gather"
            ],
            "MatMulConstBOnly": false
        },
        "insert_beam_search": {
            "type": "InsertBeamSearch",
            "use_forced_decoder_ids": true,
            "use_logits_processor": false,
            "fp16": false
        },
        "prepost": {
            "type": "AppendPrePostProcessingOps",
            "tool_command": "whisper",
            "tool_command_args": {
                "model_name": "openai/whisper-medium",
                "use_audio_decoder": true
            },
            "target_opset": 17
        }
    },
    "log_severity_level": 0,
    "host": "local_system",
    "target": "local_system",
    "evaluator": "common_evaluator",
    "evaluate_input_model": false,
    "clean_cache": false,
    "cache_dir": "cache",
    "output_dir": "models",
    "output_name": "whisper_gpu_int8"
}

Olive logs Add logs here.

Other information

Additional context Related: https://github.com/microsoft/Olive/issues/1213 Optimized on ubuntu server with cuda

jambayk commented 2 months ago

The whisper example has only been optimized and tested for the CPU and CUDA EPs. It uses multiple ONNX Runtime custom operators that have not been implemented on other EPs.