johnsmith0031 / alpaca_lora_4bit


Support for MoE models? #156

Open laoda513 opened 5 months ago

laoda513 commented 5 months ago

Hi,

Is there any plan to support the recently released MoE models, or any suggestions on how to do it myself?

This project is quite lightweight and easy to use.

johnsmith0031 commented 5 months ago

I think it supports most models with the llama architecture, though the modules may have different names in transformers.

You can define a new function to load customized models, similar to load_llama_model_4bit_low_ram in autograd_4bit.py. A simple example:

import time

import torch.nn as nn
from colorama import Fore, Style
from huggingface_hub.utils import HFValidationError

# 4-bit quantization helpers from this repo's autograd_4bit.py
from autograd_4bit import make_quant_for_4bit_autograd, model_to_half

def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
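    # Recursively collect submodules of the listed types, keyed by their full dotted name.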
    if type(module) in layers:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(
            child, layers=layers, name=name + '.' + name1 if name != '' else name1
        ))
    return res

def load_model(config_path, model_path, groupsize=-1, half=False, device_map="auto", seqlen=2048, is_v1_model=False, bits=4):
    import accelerate
    from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer

    print(Style.BRIGHT + Fore.CYAN + "Loading Model ...")
    t0 = time.time()

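    # Build the model skeleton on the meta device, then replace the target
    # Linear layers with 4-bit autograd layers before loading any weights.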
    with accelerate.init_empty_weights():
        config = LlamaConfig.from_pretrained(config_path)
        model = LlamaForCausalLM(config)
        model = model.eval()
        layers = find_layers(model)
        for name in ['lm_head']:
            if name in layers:
                del layers[name]
        make_quant_for_4bit_autograd(model, layers, groupsize=groupsize, is_v1_model=is_v1_model, bits=bits)
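    # Load the quantized checkpoint and dispatch the layers across the available devices.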
    model = accelerate.load_checkpoint_and_dispatch(
        model=model,
        checkpoint=model_path,
        device_map=device_map,
        no_split_module_classes=["LlamaDecoderLayer"]
    )

    model.seqlen = seqlen

    if half:
        model_to_half(model)

    try:
        tokenizer = LlamaTokenizer.from_pretrained(config_path)
    except HFValidationError:
        # Fall back to the checkpoint path when config_path is not a valid repo id
        tokenizer = LlamaTokenizer.from_pretrained(model_path)
    tokenizer.truncation_side = 'left'

    print(Style.BRIGHT + Fore.GREEN + f"Loaded the model in {(time.time()-t0):.2f} seconds.")

    return model, tokenizer
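
A call would then look something like this (the paths are placeholders; point them at your own config directory and 4-bit checkpoint, and set groupsize to match how the checkpoint was quantized):

model, tokenizer = load_model(
    "./llama-13b/",                       # placeholder: config / tokenizer directory
    "./llama-13b-4bit-128g.safetensors",  # placeholder: quantized weights
    groupsize=128,
    half=True,
)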

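For an MoE model like Mixtral, the same pattern should work with the Llama classes swapped for their Mixtral counterparts. This is only a sketch: it assumes a transformers version that ships the Mixtral architecture, and you should check the actual module names with find_layers first.

def load_moe_model_4bit(config_path, model_path, groupsize=-1, device_map="auto",
                        seqlen=2048, is_v1_model=False, bits=4):
    # Sketch only: mirrors load_model above, with the Mixtral classes.
    import accelerate
    from transformers import AutoTokenizer, MixtralConfig, MixtralForCausalLM

    with accelerate.init_empty_weights():
        config = MixtralConfig.from_pretrained(config_path)
        model = MixtralForCausalLM(config).eval()
        layers = find_layers(model)
        layers.pop('lm_head', None)  # keep the output head unquantized
        # Depending on how the checkpoint was quantized, you may also want to
        # drop the expert routing gates (block_sparse_moe.gate) from `layers`.
        make_quant_for_4bit_autograd(model, layers, groupsize=groupsize,
                                     is_v1_model=is_v1_model, bits=bits)

    model = accelerate.load_checkpoint_and_dispatch(
        model=model,
        checkpoint=model_path,
        device_map=device_map,
        no_split_module_classes=["MixtralDecoderLayer"]
    )
    model.seqlen = seqlen

    tokenizer = AutoTokenizer.from_pretrained(config_path)
    tokenizer.truncation_side = 'left'
    return model, tokenizer
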
Also, you can change which modules you want to train in finetune.py:

# Configure LoRA
lora_config = LoraConfig(
    r=ft_config.lora_r,
    lora_alpha=ft_config.lora_alpha,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=ft_config.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
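
For an MoE model you would also want to extend target_modules to cover the expert layers. As a sketch, assuming Mixtral-style naming where each expert's MLP projections are called w1 / w2 / w3 (verify against the find_layers output for your model):

# Sketch for a Mixtral-style MoE model; adjust the names to your architecture.
lora_config = LoraConfig(
    r=ft_config.lora_r,
    lora_alpha=ft_config.lora_alpha,
    target_modules=["q_proj", "v_proj", "w1", "w2", "w3"],
    lora_dropout=ft_config.lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
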
laoda513 commented 4 months ago

Thanks!