vllm-project / vllm

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[Bug]: Phi3 lora module not loading #4915

Open arunpatala opened 3 months ago

arunpatala commented 3 months ago

ValueError: While loading /data/llm_resume_profiles_phi3_v1, expected target modules in ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'embed_tokens', 'lm_head'] but received ['qkv_proj', 'gate_up_proj']. Please verify that the loaded LoRA module is correct.

I am unable to load a LoRA module with the Phi3-128k-instruct version. Can support for this be added? I am using the vLLM Docker image, version 0.4.2.

thanks

WeiXiaoSummer commented 3 months ago

Yes, you can refer to #4715 for a conversion script. However, Phi3 128k also contains a gate_up_proj, so you can modify the script to decompose the gate_up_proj layer as well, like the following:

import torch
import os
import json
import fire
import shutil

from safetensors.torch import load_file, save_file

def replicate_lora_a_qkv(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # lora_A projects the input into the low-rank space, so the same matrix is
    # simply replicated for each of the split q/k/v projections
    prefix, suffix = name.split('qkv_proj')
    res = {}
    for t in ['q_proj', 'k_proj', 'v_proj']:
        name = f"{prefix}{t}{suffix}"
        res[name] = weight.clone()
    return res

def replicate_lora_a_gate_up(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # same idea: replicate lora_A for the split gate_proj and up_proj
    prefix, suffix = name.split('gate_up_proj')
    res = {}
    for t in ['gate_proj', 'up_proj']:
        name = f"{prefix}{t}{suffix}"
        res[name] = weight.clone()
    return res

def split_lora_b_qkv(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # lora_B maps the low-rank space to the fused qkv output, so it is split
    # row-wise into three equal parts (assumes equal q/k/v sizes, as in Phi3-mini)
    size = weight.shape[0] // 3
    prefix, suffix = name.split('qkv_proj')
    res = {
        f"{prefix}{t}{suffix}": w
        for t, w in zip(['q_proj', 'k_proj', 'v_proj'], weight.split(size))
    }
    return res

def split_lora_b_gate_up(name: str, weight: "torch.Tensor") -> dict[str, "torch.Tensor"]:
    # lora_B for the fused gate_up projection is split row-wise into equal gate and up halves
    size = weight.shape[0] // 2
    prefix, suffix = name.split('gate_up_proj')
    res = {
        f"{prefix}{t}{suffix}": w
        for t, w in zip(['gate_proj', 'up_proj'], weight.split(size))
    }
    return res

def convert_qkv_gate_up_lora_to_splits_vllm(adapter_folder_path: str, output_folder_path: str) -> None:
    """Split the fused qkv_proj / gate_up_proj LoRA weights and save a vLLM-compatible copy of the adapter."""

    adapter_bin_name = 'adapter_model.safetensors'
    adapter_config_name = 'adapter_config.json'

    lora = load_file(f"{adapter_folder_path}/{adapter_bin_name}")
    with open(f"{adapter_folder_path}/{adapter_config_name}", 'r') as f:
        lora_config = json.load(f)

    assert 'qkv_proj' in lora_config['target_modules']
    assert 'gate_up_proj' in lora_config['target_modules']

    # converting weights
    res = {}
    for k, v in lora.items():
        if 'qkv_proj' in k and 'lora_A' in k:
            res.update(replicate_lora_a_qkv(k, v))
        elif 'qkv_proj' in k and 'lora_B' in k:
            res.update(split_lora_b_qkv(k, v))
        elif 'gate_up_proj' in k and 'lora_A' in k:
            res.update(replicate_lora_a_gate_up(k, v))
        elif 'gate_up_proj' in k and 'lora_B' in k:
            res.update(split_lora_b_gate_up(k, v))
        else:
            res[k] = v

    # converting config
    temp = ['q_proj', 'k_proj', 'v_proj', 'gate_proj', 'up_proj'] + [t for t in lora_config['target_modules'] if t != 'qkv_proj' and t != 'gate_up_proj']
    lora_config['target_modules'] = temp

    # saving
    os.makedirs(output_folder_path, exist_ok=True)
    save_file(res, f"{output_folder_path}/{adapter_bin_name}", metadata={"format": "pt"})
    with open(f"{output_folder_path}/{adapter_config_name}", 'w') as f:
        json.dump(lora_config, f, indent=4)

    for file in os.listdir(adapter_folder_path):
        if file != adapter_bin_name and file != adapter_config_name:
            shutil.copy(f"{adapter_folder_path}/{file}", f"{output_folder_path}/{file}")

if __name__ == "__main__":
    fire.Fire(convert_qkv_gate_up_lora_to_splits_vllm)

It can load without error, but I'm not sure whether there will be any performance issues.
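
For reference, here is a minimal usage sketch, assuming the script above is saved as convert_phi3_lora.py; the input path is the one from the error message at the top of the issue, and the output path is just a placeholder:

# Run as a CLI through fire, e.g.:
#   python convert_phi3_lora.py /data/llm_resume_profiles_phi3_v1 /data/llm_resume_profiles_phi3_v1_split
# or call the function directly:
from convert_phi3_lora import convert_qkv_gate_up_lora_to_splits_vllm

convert_qkv_gate_up_lora_to_splits_vllm(
    adapter_folder_path="/data/llm_resume_profiles_phi3_v1",       # original PEFT adapter with fused modules
    output_folder_path="/data/llm_resume_profiles_phi3_v1_split",  # vLLM-compatible copy with split modules
)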

arunpatala commented 3 months ago

Thanks a lot. I will use this, but will vLLM be adding support for this in the future?

SHIMURA0 commented 3 months ago

After decomposing the layers of Phi3, will there be any problem merging the LoRA layers back into Phi3? Do we have to merge the decomposed layers back before appending the LoRA back onto the original Phi3?

KaranChand commented 3 months ago

I added the new projection layers using the conversion code above and loaded the adapter with the standard vLLM LoRA code (using snapshot_download) from the documentation (https://docs.vllm.ai/en/latest/models/lora.html#using-lora-adapters). It worked without errors, but performance dropped massively compared to the same model (with merged LoRA) run without vLLM, even though I used the same tokenizer for both models. I have not yet found out why this is the case; it might just be a bug in my own code somewhere.

Update: there was a bug in my own code; the script above works just fine and the LoRA weights are applied correctly.
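
For anyone following the same route, here is a minimal sketch of loading the converted adapter with vLLM's offline API, adapted from the linked documentation; the base model id, adapter name, and paths are assumptions, not values from this thread:

from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# enable_lora must be set when the engine is created;
# if the adapter rank is above the default of 16, also pass max_lora_rank=<rank>
llm = LLM(model="microsoft/Phi-3-mini-128k-instruct", enable_lora=True)

outputs = llm.generate(
    ["Summarize the following resume: ..."],
    SamplingParams(temperature=0.0, max_tokens=256),
    # name, integer id, and local path of the *converted* (split) adapter
    lora_request=LoRARequest("phi3_lora", 1, "/data/llm_resume_profiles_phi3_v1_split"),
)
print(outputs[0].outputs[0].text)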

WeiXiaoSummer commented 3 months ago

Thanks a lot. I will use this, but will vLLM be adding support for this in the future?

I guess they will; otherwise we will always need to convert the LoRA first, which is not very convenient.

WeiXiaoSummer commented 3 months ago

After decomposing the layers of Phi3, will there be any problem merging the LoRA layers back into Phi3? Do we have to merge the decomposed layers back before appending the LoRA back onto the original Phi3?

I think you can directly merge the original (not decomposed) LoRA adapter into Phi3 if you're not trying to load the LoRA via vLLM.
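
For context, a minimal sketch of that direct merge with PEFT (outside vLLM); the base model id and output path are assumptions:

from transformers import AutoModelForCausalLM
from peft import PeftModel

# load the base model and the ORIGINAL adapter (with the fused qkv_proj / gate_up_proj targets);
# add trust_remote_code=True if your transformers version requires it for Phi3
base = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
model = PeftModel.from_pretrained(base, "/data/llm_resume_profiles_phi3_v1")

# fold the LoRA weights into the base weights; the merged model no longer needs PEFT at inference
merged = model.merge_and_unload()
merged.save_pretrained("/data/phi3_merged_lora")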

knight-fork commented 2 weeks ago

Update: there was a bug in my own code; the script above works just fine and the LoRA weights are applied correctly.

What was the bug? I am also facing the same thing; my performance dropped heavily.