unslothai / unsloth

Finetune Llama 3.2, Mistral, Phi & Gemma LLMs 2-5x faster with 80% less memory
https://unsloth.ai
Apache License 2.0

KeyError: 'base_model.model.model.layers.0.mlp.down_proj.lora_A.weight' #960

Open Iven2132 opened 2 months ago

Iven2132 commented 2 months ago

Hi, I'm trying to fine-tune the Llama 3.1 8B model. After fine-tuning it and uploading it to HF, when I try to run it with vLLM I get this error: "KeyError: 'base_model.model.model.layers.0.mlp.down_proj.lora_A.weight'". Can anyone help me, please?

Here is my fine-tuning script:

        from unsloth import FastLanguageModel
        import torch
        from datasets import load_dataset

        from trl import SFTTrainer
        from transformers import TrainingArguments
        from unsloth import is_bfloat16_supported

        max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
        dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
        load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

        # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
        fourbit_models = [
            "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
            "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
            "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
            "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
            "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
            "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
            "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
            "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
            "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
            "unsloth/Phi-3-medium-4k-instruct",
            "unsloth/gemma-2-9b-bnb-4bit",
            "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
        ] # More models at https://huggingface.co/unsloth

        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name = "unsloth/Meta-Llama-3.1-8B",
            max_seq_length = max_seq_length,
            dtype = dtype,
            load_in_4bit = load_in_4bit
        )

        model = FastLanguageModel.get_peft_model(
            model,
            r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
            target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                              "gate_proj", "up_proj", "down_proj",],
            lora_alpha = 16,
            lora_dropout = 0, # Supports any, but = 0 is optimized
            bias = "none",    # Supports any, but = "none" is optimized
            # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
            use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
            random_state = 3407,
            use_rslora = False,  # We support rank stabilized LoRA
            loftq_config = None, # And LoftQ
        )

        alpaca_prompt = """You're helpful assistant

        ### Instruction:
        {}

        ### Input:
        {}

        ### Response:
        {}"""

        EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
        def formatting_prompts_func(examples):
            instructions = examples["instruction"]
            inputs       = examples["input"]
            outputs      = examples["output"]
            texts = []
            for instruction, input, output in zip(instructions, inputs, outputs):
                # Must add EOS_TOKEN, otherwise your generation will go on forever!
                text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
                texts.append(text)
            return { "text" : texts, }
        pass

        dataset = load_dataset("myname/datasetname", split = "train", token="......")
        dataset = dataset.map(formatting_prompts_func, batched = True)

        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = dataset,
            dataset_text_field = "text",
            max_seq_length = max_seq_length,
            dataset_num_proc = 2,
            packing = False, # Can make training 5x faster for short sequences.
            args = TrainingArguments(
                per_device_train_batch_size = 2,
                gradient_accumulation_steps = 4,
                warmup_steps = 5,
                # num_train_epochs = 1, # Set this for 1 full training run.
                max_steps = 60,
                learning_rate = 2e-4,
                fp16 = not is_bfloat16_supported(),
                bf16 = is_bfloat16_supported(),
                logging_steps = 1,
                optim = "adamw_8bit",
                weight_decay = 0.01,
                lr_scheduler_type = "linear",
                seed = 3407,
                output_dir = "outputs",
            ),
        )
        trainer_stats = trainer.train()
        model.save_pretrained("lora_model") 
        tokenizer.save_pretrained("lora_model")
        model.push_to_hub("myname/finetunedllama", token = ".....") # Online saving
        tokenizer.push_to_hub("myname/finetunedllama", token = "....") # Online saving

And here is my vLLM script:

        from vllm import LLM, SamplingParams

        llm = LLM(model="myname/finetunedllama")

        sampling_params = SamplingParams(
            temperature=0.8,
            max_tokens=500
        )

        outputs = llm.generate(["Hey"], sampling_params)
        print(outputs)
danielhanchen commented 2 months ago

@Iven2132 Apologies for the delay - you're saving a LoRA adapter, so you need to call vLLM with the adapter loaded as a LoRA, which works a bit differently from serving a standalone model.
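
If you'd rather serve it with a plain `LLM(model=...)` call instead, one option is to push a merged checkpoint rather than just the adapter. A minimal sketch using Unsloth's merged-save helper (the repo name is a placeholder, and the exact arguments are an assumption based on the Unsloth docs):

        # Assumed usage: merge the LoRA weights into the base model and upload
        # the merged 16-bit checkpoint, which vLLM can load like any HF model.
        model.push_to_hub_merged(
            "myname/finetunedllama-merged",  # placeholder repo name
            tokenizer,
            save_method = "merged_16bit",
            token = ".....",                 # placeholder token, as in the script
        )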

Iven2132 commented 2 months ago

@danielhanchen Can you give me an example?

danielhanchen commented 1 month ago

@Iven2132 Sorry for the delay!! https://docs.vllm.ai/en/latest/models/lora.html should be useful
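
Based on that page, a minimal sketch of serving the adapter (the base model name and adapter repo are taken from the script above and are assumptions; `snapshot_download` just pulls the adapter files locally):

        from huggingface_hub import snapshot_download
        from vllm import LLM, SamplingParams
        from vllm.lora.request import LoRARequest

        # Download the LoRA adapter produced by the training script
        lora_path = snapshot_download(repo_id="myname/finetunedllama")

        # Load the base model with LoRA support enabled
        llm = LLM(model="unsloth/Meta-Llama-3.1-8B", enable_lora=True)

        sampling_params = SamplingParams(temperature=0.8, max_tokens=500)

        # Attach the adapter per request
        outputs = llm.generate(
            ["Hey"],
            sampling_params,
            lora_request=LoRARequest("my_adapter", 1, lora_path),
        )
        print(outputs)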

SoMinesawa commented 3 weeks ago

Try adding the LLM options:

        quantization="bitsandbytes",
        load_format="bitsandbytes"

Source: https://docs.vllm.ai/en/v0.6.0/quantization/bnb.html
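
A minimal sketch of where those options go in the vLLM script above (the bnb-4bit base checkpoint name is an assumption):

        from vllm import LLM

        # Load a bitsandbytes 4-bit checkpoint directly in vLLM
        llm = LLM(
            model="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
            quantization="bitsandbytes",
            load_format="bitsandbytes",
        )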