huggingface / optimum

🚀 Accelerate training and inference of 🤗 Transformers and 🤗 Diffusers with easy to use hardware optimization tools
https://huggingface.co/docs/optimum/main/
Apache License 2.0

Add support for Musicgen Melody in the ONNX export #2095

Open rubeniskov opened 1 week ago

rubeniskov commented 1 week ago

Feature request

Support ONNX export of Musicgen Melody with audio prompting.

Motivation

Currently, Optimum does not support ONNX export for Musicgen Melody models. Transformers already supports this model (musicgen-melody) through its dedicated configuration in configuration_musicgen_melody, but the corresponding export configuration is missing in Optimum. ONNX export of musicgen-melody with "audio prompting" would enable audio- and text-conditioned music generation, which is essential for advanced music editing and generation applications.

optimum-cli export onnx --model facebook/musicgen-melody musicgen_melody_onnx/

https://github.com/huggingface/transformers/blob/main/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py
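For context, the eager PyTorch path that the ONNX export would eventually need to mirror is the audio-prompted generation flow in Transformers. A minimal sketch follows; the processor/generate arguments mirror the usual Transformers conventions for this model and the synthetic waveform is only a placeholder, so treat the details as assumptions to verify against the musicgen-melody docs:

import numpy as np
from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration

processor = AutoProcessor.from_pretrained("facebook/musicgen-melody")
model = MusicgenMelodyForConditionalGeneration.from_pretrained("facebook/musicgen-melody")

# Placeholder melody prompt: ~10 s of mono audio. The processor turns it into
# chroma features ("input_features") that condition generation alongside the text.
melody = np.random.randn(32000 * 10)

inputs = processor(
    audio=melody,
    sampling_rate=32000,
    text=["80s pop track with bassy drums and synth"],
    padding=True,
    return_tensors="pt",
)
audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)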

Your contribution

I’ll attempt to adapt the existing Musicgen ONNX configuration to work with musicgen-melody, adding the audio_decoder dummy generator and wiring up the necessary inputs. However, I have limited knowledge in this area, especially around setting up the dummy generator and handling the inputs for the audio encoder. Any guidance or examples would be greatly appreciated.
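To make the starting point concrete, here is the kind of adaptation I have in mind, written as an untested sketch: the base class and method names reflect my understanding of optimum.utils.input_generators, and the "input_features" name and (batch, chroma_length, num_chroma) shape are assumptions derived from the config comparison below, so all of it needs review from someone who knows the exporter internals.

from optimum.utils.input_generators import DummyInputGenerator

class MusicgenMelodyDummyAudioInputGenerator(DummyInputGenerator):
    # Hypothetical generator for the chroma conditioning tensor of musicgen-melody.
    SUPPORTED_INPUT_NAMES = ("input_features",)

    def __init__(self, task, normalized_config, batch_size=2, **kwargs):
        self.task = task
        self.batch_size = batch_size
        # Values visible in the config diff below: chroma_length=235, num_chroma=12.
        self.chroma_length = getattr(normalized_config, "chroma_length", 235)
        self.num_chroma = getattr(normalized_config, "num_chroma", 12)

    def generate(self, input_name, framework="pt", int_dtype="int64", float_dtype="fp32"):
        # Assumed shape: (batch, chroma_length, num_chroma).
        return self.random_float_tensor(
            [self.batch_size, self.chroma_length, self.num_chroma],
            framework=framework,
            dtype=float_dtype,
        )

# The melody-specific ONNX config would then extend the existing Musicgen one,
# register this generator in its dummy-input-generator classes, and expose the
# extra input with dynamic axes, roughly:
#     common_inputs["input_features"] = {0: "batch_size", 1: "chroma_length"}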

gabotechs commented 1 week ago

+1, it would be great to be able to export Musicgen Melody models to ONNX

rubeniskov commented 1 week ago

As far as I can see, the configuration is pretty similar, with the exception of two new parameters ("chroma_length" and "num_chroma"), the removal of "is_encoder_decoder" and "classifier_dropout", and larger decoder hyperparameters (the comparison below is against musicgen-small). A quick way to reproduce this comparison programmatically is sketched after the diff.

diff --git a/MusicgenConfig.txt b/MusicgenMelodyConfig.txt
index 6392f0c..24ac168 100644
--- a/MusicgenConfig.txt
+++ b/MusicgenMelodyConfig.txt
@@ -1,8 +1,8 @@
 {
   "_attn_implementation_autoset": true,
-  "_name_or_path": "facebook/musicgen-small",
+  "_name_or_path": "facebook/musicgen-melody",
   "architectures": [
-    "MusicgenForConditionalGeneration"
+    "MusicgenMelodyForConditionalGeneration"
   ],
   "audio_encoder": {
     "_attn_implementation_autoset": false,
@@ -100,6 +100,7 @@
     "use_causal_conv": false,
     "use_conv_shortcut": false
   },
+  "chroma_length": 235,
   "decoder": {
     "_attn_implementation_autoset": false,
     "_name_or_path": "",
@@ -113,7 +114,6 @@
     "begin_suppress_tokens": null,
     "bos_token_id": 2048,
     "chunk_size_feed_forward": 0,
-    "classifier_dropout": 0,
     "cross_attention_hidden_size": null,
     "decoder_start_token_id": null,
     "diversity_penalty": 0,
@@ -123,11 +123,11 @@
     "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": null,
     "exponential_decay_length_penalty": null,
-    "ffn_dim": 4096,
+    "ffn_dim": 6144,
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
-    "hidden_size": 1024,
+    "hidden_size": 1536,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
@@ -144,13 +144,13 @@
     "max_length": 20,
     "max_position_embeddings": 2048,
     "min_length": 0,
-    "model_type": "musicgen_decoder",
+    "model_type": "musicgen_melody_decoder",
     "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
+    "num_attention_heads": 24,
     "num_beam_groups": 1,
     "num_beams": 1,
     "num_codebooks": 4,
-    "num_hidden_layers": 24,
+    "num_hidden_layers": 48,
     "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
@@ -181,8 +181,8 @@
     "use_cache": true,
     "vocab_size": 2048
   },
-  "is_encoder_decoder": true,
-  "model_type": "musicgen",
+  "model_type": "musicgen_melody",
+  "num_chroma": 12,
   "text_encoder": {
     "_attn_implementation_autoset": false,
     "_name_or_path": "t5-base",