arcee-ai / mergekit

Tools for merging pretrained large language models.

Support for Phi-3-Small [Feature ?] #396

Open hammoudhasan opened 3 months ago

hammoudhasan commented 3 months ago

It seems that Phi-3-Small models do not use Phi3ForCausalLM but rather Phi3SmallForCausalLM. I tried writing an architecture config for merging the blocks, but it didn't work properly on my end. I've attached the one I prepared below:

{
    "model_type": "phi",
    "architectures": [
        "Phi3SmallForCausalLM"
    ],
    "pre_weights": [
        {
            "name": "model.embed_tokens.weight",
            "is_embed": true
        }
    ],
    "post_weights": [
        {
            "name": "model.final_layernorm.weight",
            "is_embed": false
        },
        {
            "name": "model.final_layernorm.bias",
            "is_embed": false
        }
    ],
    "num_layers_config_key": "num_hidden_layers",
    "layer_templates": {
        "weights": [
            {
                "name": "model.layers.${layer_index}.input_layernorm.weight",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.input_layernorm.bias",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.post_attention_layernorm.weight",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.post_attention_layernorm.bias",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.self_attn.query_key_value.weight",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.self_attn.query_key_value.bias",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.self_attn.dense.weight",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.self_attn.dense.bias",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.mlp.up_proj.weight",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.mlp.up_proj.bias",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.mlp.down_proj.weight",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.mlp.down_proj.bias",
                "is_embed": false
            },
            {
                "name": "model.layers.${layer_index}.self_attn.rotary_emb.inv_freq",
                "is_embed": false,
                "optional": true
            }
        ]
    }
}
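
For what it's worth, Phi-3-Small checkpoints report "phi3small" as their model_type, which is the value mergekit matches the "model_type" field against, and I marked the rotary inv_freq buffer optional since rotary buffers aren't always serialized in checkpoints. To double-check the tensor names against an actual checkpoint, something like the sketch below should work; the model id is just an example, and it assumes a sharded safetensors checkpoint:

import json

from huggingface_hub import hf_hub_download
from transformers import AutoConfig

MODEL_ID = "microsoft/Phi-3-small-8k-instruct"  # example checkpoint

# The HF config carries the architecture class and model_type that the
# architecture JSON above has to match (Phi-3-Small ships remote code).
cfg = AutoConfig.from_pretrained(MODEL_ID, trust_remote_code=True)
print(cfg.architectures)  # expect ['Phi3SmallForCausalLM']
print(cfg.model_type)     # expect 'phi3small'

# The safetensors index maps every tensor name to its shard, i.e. exactly the
# set of names the pre_weights/post_weights/layer_templates entries must cover.
index_path = hf_hub_download(MODEL_ID, "model.safetensors.index.json")
with open(index_path) as f:
    weight_map = json.load(f)["weight_map"]
for name in sorted(weight_map):
    print(name)

Comparing that output with the "weights" entries above would at least rule out tensor-name mismatches as the reason the merge fails.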