unslothai / unsloth

Finetune Llama 3.2, Mistral, Phi, Qwen 2.5 & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

model loading failed: RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM: size mismatch for base_model.model.model.embed_tokens.weight #943

Open BoyangGu1 opened 3 months ago

BoyangGu1 commented 3 months ago

I used the following code to run SFT on Llama 3.1:

import os
import wandb
os.environ["WANDB_PROJECT"] = "unsloth-mimic-20240814"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints
from utils import mem_report

from unsloth import FastLanguageModel
import torch
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

from prepare_datadict import load_datadict
from time import time

def add_special_token(token: str, description: str, tokenizer, model):
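    # Register the new special token, grow the embedding matrix to match, and
    # initialize the new row as the mean embedding of the tokens in `description`.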
    tokenizer.add_special_tokens({'additional_special_tokens':[token]})
    model.resize_token_embeddings(len(tokenizer))

    # Get the index of the new token
    new_token_id = tokenizer.convert_tokens_to_ids(token)

    # Tokenize the text
    encoded_dict = tokenizer(description, return_tensors='pt')
    # Get the special token IDs
    special_tokens = tokenizer.all_special_ids
    # Filter out the special token IDs
    filtered_input_ids = [token_id for token_id in encoded_dict['input_ids'].squeeze().tolist() if token_id not in special_tokens]

    with torch.no_grad():
        # Initialize the new embedding
        embeddings_layer = model.get_input_embeddings()
        new_embedding = 0
        for token_id in filtered_input_ids:
            new_embedding += embeddings_layer.weight[token_id]
        new_embedding /= len(filtered_input_ids)

        # Update the embedding matrix of the model
        embeddings_layer.weight[new_token_id] = new_embedding

def get_formatting_prompts_func(EOS_TOKEN: str, prompt: str):
    def formatting_prompts_func(examples):
        documents = examples["DOCUMENT"]
        summaries = examples["SUMMARY"]
        texts = []
        for document, summary in zip(documents, summaries):
            # Must add EOS_TOKEN, otherwise your generation will go on forever!
            text = prompt.format(document, summary) + EOS_TOKEN
            texts.append(text)
        return {"text": texts}
    return formatting_prompts_func

def get_datadict_formatting_prompts_func(EOS_TOKEN: str, prompt: str):
    def datadict_formatting_prompts_func(document: str, summary: str) -> str:
        text = prompt.format(document, summary) + EOS_TOKEN
        return text
    return datadict_formatting_prompts_func

def main():
    max_seq_length = 32768 # Choose any! We auto support RoPE Scaling internally!
    dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "unsloth/Meta-Llama-3.1-8B",
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    add_special_token("DOC_SEP", "separation of two documents", tokenizer, model)

    model = FastLanguageModel.get_peft_model(
        model,
        r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 16,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
        use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )

    response_template = "the following is the golden brief hospital course section:"
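    # Drop the first two ids: the template can tokenize differently at the start of a
    # string than mid-sequence, so only the stable suffix is matched by the collator.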
    response_template_ids = tokenizer.encode(response_template, add_special_tokens=False)[2:]
    collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

    EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
    prompt = "extract the hospital courses of the following reports and summarize a brief hospital course section: {}\n the following is the golden brief hospital course section: {}"

    datadict_formatting_prompts_func = get_datadict_formatting_prompts_func(EOS_TOKEN, prompt)
    data_dict = load_datadict(mimic_version="mimic-iii",
                        heading_type="hpc",
                        tokenizer=tokenizer,
                        tokenizer_name="Meta-Llama-3.1-8B-with-DOC_SEP-true",
                        max_token_length=32768,
                        prompt_func=datadict_formatting_prompts_func,
                        prompt_name="hpc1",
                        prompt=prompt,
                        ratio=(0.9, 0.1),)

    run_name = "unsloth_32768_lora64_batch128_epoch1_with_DOC_SEP"
    output_dir = os.path.join("outputs", run_name)
    os.makedirs(output_dir, exist_ok=True)
    args = TrainingArguments(
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=128,
            warmup_ratio = 0.03,
            num_train_epochs = 1, # Set this for 1 full training run.
            eval_strategy="epoch",
            save_strategy="epoch",
            # save_strategy="steps",
            # save_steps=100,
            group_by_length=True,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="cosine",
            seed=3407,
            max_grad_norm=0.3,
            output_dir=output_dir,
            report_to="wandb",
            run_name=run_name,
        )

    formatting_prompts_func = get_formatting_prompts_func(EOS_TOKEN, prompt)
    train_dataset = data_dict["train"].map(formatting_prompts_func, batched=True)
    eval_dataset = data_dict["val"].map(formatting_prompts_func, batched=True)
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        dataset_text_field="text",
        data_collator=collator,
        max_seq_length=max_seq_length,
        dataset_num_proc=1,
        packing=False, # Can make training 5x faster for short sequences.
        args=args,
    )

    mem_report()
    start_time = time()
    # trainer_stats = trainer.train(resume_from_checkpoint=True)
    trainer_stats = trainer.train()
    end_time = time()
    duration = end_time - start_time
    print(f"duration: {duration}")
    mem_report()

    model.save_pretrained("lora_model" + run_name) # Local saving
    tokenizer.save_pretrained("lora_model" + run_name)

if __name__ == "__main__":
    main()

In particular, I added a new special token, DOC_SEP. Then, when I tried to load the model, it crashed:

from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/data/bob_files/MIMIC_project_data/lora_model_unsloth_32768_lora64_batch128_epoch1_with_DOC_SEP", # YOUR MODEL YOU USED FOR TRAINING
    max_seq_length = 32768,
    dtype = None,
    load_in_4bit = True,
)
FastLanguageModel.for_inference(model)

I got the error:

Traceback (most recent call last):
  File "/data/bob_files/save_unslot.py", line 4, in <module>
    model, tokenizer = FastLanguageModel.from_pretrained(
  File "/data/bob_files/miniconda3/envs/vllm_env/lib/python3.10/site-packages/unsloth/models/loader.py", line 352, in from_pretrained
    model = PeftModel.from_pretrained(
  File "/data/bob_files/miniconda3/envs/vllm_env/lib/python3.10/site-packages/peft/peft_model.py", line 545, in from_pretrained
    model.load_adapter(
  File "/data/bob_files/miniconda3/envs/vllm_env/lib/python3.10/site-packages/peft/peft_model.py", line 1117, in load_adapter
    load_result = set_peft_model_state_dict(
  File "/data/bob_files/miniconda3/envs/vllm_env/lib/python3.10/site-packages/peft/utils/save_and_load.py", line 395, in set_peft_model_state_dict
    load_result = model.load_state_dict(peft_model_state_dict, strict=False)
  File "/data/bob_files/miniconda3/envs/vllm_env/lib/python3.10/site-packages/torch/nn/modules/module.py", line 2215, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
        size mismatch for base_model.model.model.embed_tokens.weight: copying a param with shape torch.Size([128257, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).
        size mismatch for base_model.model.lm_head.weight: copying a param with shape torch.Size([128257, 4096]) from checkpoint, the shape in current model is torch.Size([128256, 4096]).

It seems that when the model was saved, it failed to save with the 128257-token vocabulary and stuck with the old 128256-token version (without DOC_SEP).

Please help!

BoyangGu1 commented 3 months ago

It seems that the saved adapter itself is fine, but when it is combined with the original base model (which of course does not have DOC_SEP), the LoRA merge fails. I don't know how to fix that yet.
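For anyone checking the same thing, here is a rough sketch of how the saved adapter's tensor shapes can be inspected. The directory name mirrors the save_pretrained() call above; the adapter_model.safetensors filename is an assumption (older PEFT versions write adapter_model.bin instead):

import os
from safetensors import safe_open

run_name = "unsloth_32768_lora64_batch128_epoch1_with_DOC_SEP"
adapter_dir = "lora_model" + run_name  # same path used in save_pretrained() above
path = os.path.join(adapter_dir, "adapter_model.safetensors")  # assumed filename

with safe_open(path, framework="pt") as f:
    for key in f.keys():
        if "embed_tokens" in key or "lm_head" in key:
            # Expect the first dimension to be 128257 if DOC_SEP was saved.
            print(key, tuple(f.get_tensor(key).shape))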

BoyangGu1 commented 3 months ago

@danielhanchen

danielhanchen commented 2 months ago

Hmmm will try investigating this

wuzihaoo commented 2 months ago

Try resize_model_vocab = 128257 in FastLanguageModel.from_pretrained()

Astroa7m commented 2 months ago

Try resize_model_vocab = 128257 in FastLanguageModel.from_pretrained()

Worked perfectly after passing my base model's vocab size via the resize_model_vocab param.
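For reference, a load call that follows the suggestion above, using the path from the original post; 128257 is the base vocabulary (128256) plus the added DOC_SEP token:

from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "/data/bob_files/MIMIC_project_data/lora_model_unsloth_32768_lora64_batch128_epoch1_with_DOC_SEP",
    max_seq_length = 32768,
    dtype = None,
    load_in_4bit = True,
    resize_model_vocab = 128257,  # base vocab (128256) + the added DOC_SEP token
)
FastLanguageModel.for_inference(model)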