princeton-nlp / LLM-Shearing

[ICLR 2024] Sheared LLaMA: Accelerating Language Model Pre-training via Structured Pruning
https://arxiv.org/abs/2310.06694
MIT License

Composer Model Transform problems encountered when shearing Pythia 1.4b #19

Closed · Longyichen closed this 9 months ago

Longyichen commented 9 months ago

Pythia is useful for studying the effects of pruning at different LM scales. I noticed that you provide composer_pythia.py, so I tried running this experiment at the 1.4b Pythia scale, and I'm running into some problems.

The first is the Pythia-to-Composer conversion function I wrote, along with a few settings I chose. The key map here is what I inferred from your composer_pythia.py:

def get_gpt_key_map_from_hf_to_composer(num_layers):
    """ get the keymap from hf to composer """
    key_map = {}
    key_map.update({"gpt_neox.embed_in.weight": "model.transformer.wte.weight",
                    "gpt_neox.final_layer_norm.weight": "model.transformer.ln_f.weight",
                    "gpt_neox.final_layer_norm.bias": "model.transformer.ln_f.bias",
                    "embed_out.weight": "model.transformer.output.weight",
                    "embed_out.bias": "model.transformer.output.bias"})
    for i in range(num_layers):
        key_map.update({
                        f"gpt_neox.layers.{i}.input_layernorm.weight": f"model.transformer.blocks.{i}.ln_1.weight",
                        f"gpt_neox.layers.{i}.post_attention_layernorm.weight": f"model.transformer.blocks.{i}.ln_2.weight",
                        f"gpt_neox.layers.{i}.attention.query_key_value.weight": f"model.transformer.blocks.{i}.attn.query_key_value.weight",
                        f"gpt_neox.layers.{i}.attention.dense.weight": f"model.transformer.blocks.{i}.attn.out_proj.weight",
                        f"gpt_neox.layers.{i}.mlp.dense_h_to_4h.weight": f"model.transformer.blocks.{i}.mlp.up_proj.weight",
                        f"gpt_neox.layers.{i}.mlp.dense_4h_to_h.weight": f"model.transformer.blocks.{i}.mlp.down_proj.weight",
                        f"gpt_neox.layers.{i}.input_layernorm.bias": f"model.transformer.blocks.{i}.ln_1.bias",
                        f"gpt_neox.layers.{i}.post_attention_layernorm.bias": f"model.transformer.blocks.{i}.ln_2.bias",
                        f"gpt_neox.layers.{i}.attention.query_key_value.bias": f"model.transformer.blocks.{i}.attn.query_key_value.bias",
                        f"gpt_neox.layers.{i}.attention.dense.bias": f"model.transformer.blocks.{i}.attn.out_proj.bias",
                        f"gpt_neox.layers.{i}.mlp.dense_h_to_4h.bias": f"model.transformer.blocks.{i}.mlp.up_proj.bias",
                        f"gpt_neox.layers.{i}.mlp.dense_4h_to_h.bias": f"model.transformer.blocks.{i}.mlp.down_proj.bias",
                        f"gpt_neox.layers.{i}.attention.rotary_emb.inv_freq": f"model.transformer.blocks.{i}.attn.rotary_emb.inv_freq",
                       })
    return key_map
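
For reference, here is a minimal sketch (my own code, not from the repo) of how I apply this key map to remap an HF Pythia checkpoint into the Composer naming scheme. convert_pythia_state_dict is a hypothetical helper; keys absent on the HF side (e.g. embed_out.bias, since the Pythia output head has no bias, or the rotary inv_freq buffers that newer transformers versions drop) are simply skipped:

from transformers import AutoModelForCausalLM

def convert_pythia_state_dict(hf_name="EleutherAI/pythia-1.4b", num_layers=24):
    """ remap an HF Pythia state dict to Composer key names using the map above """
    hf_model = AutoModelForCausalLM.from_pretrained(hf_name)
    key_map = get_gpt_key_map_from_hf_to_composer(num_layers)
    hf_state = hf_model.state_dict()
    composer_state = {}
    for hf_key, composer_key in key_map.items():
        if hf_key in hf_state:
            composer_state[composer_key] = hf_state[hf_key].clone()
        else:
            # e.g. embed_out.bias or a rotary inv_freq buffer not in this checkpoint
            print(f"skipping HF key not present in the checkpoint: {hf_key}")
    return composer_state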

In test_composer_hf_equal.py, I added a config for the 1.4b model:

def construct_example_cfg(model_size, path=None, add_l0_module=False):
    """ construct example cfg for mosaicml llama models """
    if model_size == "1.4b":
        cfg = om.create({"name": "mosaic_pythia_1", "init_device": "cpu", "d_model": 2048, 
                         "n_heads": 16, "n_layers": 24, "intermediate_size": 8192,
                         "rotary_pct": 0.25, "rotary_emb_base": 10000
                         })

    # add default values
    cfg = om.merge(cfg, om.create({
        "max_seq_len": 2048, "vocab_size": 50304, "init_std": 0.02,
        "attn_pdrop": 0.0, "resid_pdrop": 0.0, "emb_pdrop": 0.0,
        "attn_impl": "norm", "layer_norm_eps": 1e-5,
    }))
    if add_l0_module:
        cfg["l0_module"] = {
            "start_sparsity": 0, "target_sparsity": 0.6,
            "pruning_modules": ["head", "head_layer", "mlp", "intermediate", "hidden"],
            "lagrangian_warmup_steps": "320ba",
        }
    return cfg
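
For context, this is roughly how I wire the two pieces together; the import path and the ComposerMosaicPythia constructor signature are assumptions on my side, mirroring what the Llama equivalence test does:

from llmshearing.models.composer_pythia import ComposerMosaicPythia  # import path assumed

cfg = construct_example_cfg("1.4b")
composer_model = ComposerMosaicPythia(cfg)  # constructor signature assumed, as in the llama test

composer_state = convert_pythia_state_dict("EleutherAI/pythia-1.4b", num_layers=cfg.n_layers)
missing, unexpected = composer_model.load_state_dict(composer_state, strict=False)
print("missing keys:", missing)
print("unexpected keys:", unexpected)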

Final result:

(Pdb) composer_model
ComposerMosaicPythia(
  (model): PythiaModel(
    (transformer): ModuleDict(
      (wte): PythiaEmbedding(50304, 2048)
      (blocks): ModuleList(
        (0-23): 24 x PythiaBlock(
          (ln_1): CoFiLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (attn): PythiaAttention(
            (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
            (out_proj): Linear(in_features=2048, out_features=2048, bias=True)
            (rotary_emb): RotaryEmbedding()
          )
          (ln_2): CoFiLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
          (mlp): PythiaMLP(
            (down_proj): Linear(in_features=8192, out_features=2048, bias=True)
            (up_proj): Linear(in_features=2048, out_features=8192, bias=True)
          )
        )
      )
      (output): Linear(in_features=2048, out_features=50304, bias=False)
      (ln_f): CoFiLayerNorm((2048,), eps=1e-05, elementwise_affine=True)
    )
  )
)
(Pdb) hf_model
GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2048)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          (act): GELUActivation()
        )
      )
    )
    (final_layer_norm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  )
  (embed_out): Linear(in_features=2048, out_features=50304, bias=False)
)

The structures of the two models look very different, and I don't know whether this is an issue with how the modules are organized in composer_pythia.py. Finally, I compared the outputs of the two models, and they differ substantially.
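
For concreteness, this is roughly how I compared the outputs (a sketch; the prompt is arbitrary, and the Composer model's forward/return conventions are assumptions on my side):

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-1.4b")
hf_model = hf_model.eval()              # the GPTNeoXForCausalLM shown above
composer_model = composer_model.eval()  # the ComposerMosaicPythia shown above

inputs = tokenizer("Structured pruning of language models", return_tensors="pt")
with torch.no_grad():
    hf_logits = hf_model(**inputs).logits
    out = composer_model({"input_ids": inputs["input_ids"]})  # batch format assumed
    composer_logits = out["logits"] if isinstance(out, dict) else out

print("max abs diff:", (hf_logits - composer_logits).abs().max().item())
print("allclose (atol=1e-4):", torch.allclose(hf_logits, composer_logits, atol=1e-4))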

Will you release the Pythia-related experimental code? Would you like me to contribute the relevant code?

xiamengzhou commented 9 months ago

Hi! Our initial development was done on the Pythia series of models: we pruned Pythia-410m down to a 160M model with the same recipe we used for the Llama models, and obtained a stronger Sheared-Pythia-160m.

I haven't had the time and bandwidth to sort out that part of the code for now, but I would appreciate it if you could work through the pipeline! I am considering adding support for Pythia and Mistral in the coming weeks.

rzr002 commented 5 months ago

I've encountered the same issue. I tried to convert the Pythia model to a Composer model by imitating your Llama rewrite, and although it runs successfully, the output logits are quite different. Could you please expedite the updates for Pythia or Mistral? @xiamengzhou