huggingface / peft

🤗 PEFT: State-of-the-art Parameter-Efficient Fine-Tuning.
https://huggingface.co/docs/peft
Apache License 2.0

quantize open-Flamingo error: init reverted #723

Closed YerongLi closed 1 year ago

YerongLi commented 1 year ago

System Info

https://github.com/open-mmlab/Multimodal-GPT

Are there any good ways to quantize open-Flamingo? I found that after calling `prepare_model_for_kbit_training`, the effect of `flamingo_init()` is reverted.
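
For context, the usual k-bit + LoRA recipe on a plain causal LM looks roughly like the sketch below (my summary, not Multimodal-GPT code; the model path and LoRA hyperparameters are placeholders). The step that seems to interfere with the Flamingo setup is `prepare_model_for_kbit_training`, which freezes the base weights, upcasts the remaining non-quantized parameters to fp32, and by default enables gradient checkpointing.

import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)
# "MODEL_PATH" is a placeholder.
model = AutoModelForCausalLM.from_pretrained("MODEL_PATH", quantization_config=bnb_config)
# Freezes base weights, upcasts non-quantized params to fp32,
# and enables gradient checkpointing by default.
model = prepare_model_for_kbit_training(model)
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)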

Here is a modified version of https://github.com/open-mmlab/Multimodal-GPT/blob/main/mmgpt/models/open_flamingo/builder.py:

"""Modified from https://github.com/mlfoundations/open_flamingo"""
import open_clip
import torch
import torch.nn as nn
from bigmodelvis import Visualization
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig
# from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig

from .flamingo import Flamingo
from .flamingo_lm import FlamingoLMMixin
from .utils import extend_instance

DEBUG = 1
def create_model_and_transforms(
    clip_vision_encoder_path: str,
    clip_vision_encoder_pretrained: str,
    lang_encoder_path: str,
    tokenizer_path: str,
    decoder_layers_attr_name: str = None,
    pretrained_model_path: str = None,
    tuning_config=None,
    **flamingo_kwargs,
):
    """
    Initialize a Flamingo model from a pretrained vision encoder and language encoder.
    Appends special tokens to the tokenizer and freezes backbones.

    Args:
        clip_vision_encoder_path (str): path to pretrained clip model (e.g. "ViT-B-32")
        clip_vision_encoder_pretrained (str): name of pretraining dataset for clip model (e.g. "laion2b_s32b_b79k")
        lang_encoder_path (str): path to pretrained language encoder
        tokenizer_path (str): path to pretrained tokenizer
        decoder_layers_attr_name (str, optional): name of the decoder layers attribute. Defaults to None.
    Returns:
        Flamingo: Flamingo model from pretrained vision and language encoders
        Image processor: Pipeline to preprocess input images
        Tokenizer: A tokenizer for the language model
    """
    print("init clip vision encoder")
    vision_encoder, _, image_processor = open_clip.create_model_and_transforms(
        clip_vision_encoder_path, pretrained=clip_vision_encoder_pretrained
    )
    # set the vision encoder to output the visual features
    vision_encoder.visual.output_tokens = True
    print("init tokenizer")
    text_tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path)
    # add Flamingo special tokens to the tokenizer
    text_tokenizer.add_special_tokens({"additional_special_tokens": ["<|endofchunk|>", "<image>"]})
    if text_tokenizer.pad_token is None:
        # Issue: GPT models don't have a pad token, which we use to
        # modify labels for the loss.
        text_tokenizer.add_special_tokens({"pad_token": "<PAD>"})
    text_tokenizer.bos_token_id = 1
    text_tokenizer.eos_token_id = 2

    print("init llama")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        load_in_8bit=False,
        llm_int8_threshold=6.0,
        llm_int8_has_fp16_weight=False,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    )
    # lang_encoder = LlamaForCausalLM.from_pretrained(lang_encoder_path)
    if DEBUG == 1:
        lang_encoder = LlamaForCausalLM.from_pretrained(lang_encoder_path, quantization_config=quantization_config)
        # lang_encoder = prepare_model_for_kbit_training(lang_encoder)
    else: 
        lang_encoder = LlamaForCausalLM.from_pretrained(lang_encoder_path)

    extend_instance(lang_encoder, FlamingoLMMixin)

    if decoder_layers_attr_name is None:
        decoder_layers_attr_name = _infer_decoder_layers_attr_name(lang_encoder)
    lang_encoder.set_decoder_layers_attr_name(decoder_layers_attr_name)
    lang_encoder.resize_token_embeddings(len(text_tokenizer))

    model = Flamingo(
        vision_encoder,
        lang_encoder,
        text_tokenizer.encode("<|endofchunk|>")[-1],
        text_tokenizer.encode("<image>")[-1],
        vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"],
        cross_attn_every_n_layers=4,
        **flamingo_kwargs,
    )

    if pretrained_model_path is not None:
        print(f"loading pretrained model from {pretrained_model_path}")
        model.load_state_dict(torch.load(pretrained_model_path), strict=False)

    # Freeze all parameters
    model.requires_grad_(False)
    assert sum(p.numel() for p in model.parameters() if p.requires_grad) == 0

    if tuning_config is not None:
        if DEBUG == 1:
            model = prepare_model_for_kbit_training(model)
            model.restart(
                vision_encoder,
                lang_encoder,
                text_tokenizer.encode("<|endofchunk|>")[-1],
                text_tokenizer.encode("<image>")[-1],
                vis_dim=open_clip.get_model_config(clip_vision_encoder_path)["vision_cfg"]["width"],
                cross_attn_every_n_layers=4,
                **flamingo_kwargs,
            )

        model = prepare_model_for_tuning(model, tuning_config)
    else:
        raise ValueError("tuning_config must be provided")

    print(
        f"Flamingo model initialized with {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable parameters"
    )

    # if DEBUG == 1: model = prepare_model_for_kbit_training(model)

    def count_parameters(model):
        return sum(p.numel() for p in model.parameters() if p.requires_grad)

    # Example usage
    num_params = count_parameters(model)
    print(f" ======= Number of trainable parameters: {num_params}")

    return model, image_processor, text_tokenizer

def _infer_decoder_layers_attr_name(model):
    for k in __KNOWN_DECODER_LAYERS_ATTR_NAMES:
        if k.lower() in model.__class__.__name__.lower():
            return __KNOWN_DECODER_LAYERS_ATTR_NAMES[k]

    raise ValueError(
        f"We require the attribute name for the nn.ModuleList in the decoder storing the transformer block layers. Please supply this string manually."
    )

__KNOWN_DECODER_LAYERS_ATTR_NAMES = {
    "opt": "model.decoder.layers",
    "gptneo": "transformer.h",
    "gptj": "transformer.h",
    "gpt-j": "transformer.h",
    "pythia": "gpt_neox.layers",
    "llama": "model.layers",
}

def prepare_model_for_tuning(model: nn.Module, config):
    if config.lora:
        lora_config = LoraConfig(
            r=config.lora_r,
            lora_alpha=config.lora_alpha,
            target_modules=config.lora_target_modules,
            lora_dropout=config.lora_dropout,
            bias="none",  # won't use bias currently
            modules_to_save=[],  # TODO: might be helpful if save partial model
            task_type="VL",
        )
        model.lang_encoder = get_peft_model(model.lang_encoder, peft_config=lora_config)

    # manually unfreeze selected modules; we use substring matching
    for name, param in model.named_parameters():
        if any(substr in name for substr in config.unfrozen):
            param.requires_grad = True

    if config.vis and is_rank0():
        Visualization(model).structure_graph()
    return model

# temporary workaround, should use a common utils in the future
def is_rank0():
    if not torch.distributed.is_initialized():
        return True
    return torch.distributed.get_rank() == 0
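
One alternative I am considering (an untested sketch, not something from the Multimodal-GPT code): prepare only the quantized language encoder right after it is loaded, before `extend_instance` wraps it and before `Flamingo.__init__` calls `init_flamingo`, so that nothing runs over the model after the Flamingo layers are installed. Disabling the gradient-checkpointing side effect there would also keep `torch.utils.checkpoint` away from the wrapped decoder layers. In the builder above, the "init llama" section would become roughly:

    if DEBUG == 1:
        lang_encoder = LlamaForCausalLM.from_pretrained(
            lang_encoder_path, quantization_config=quantization_config
        )
        # Prepare the quantized LM while it is still a plain LlamaForCausalLM,
        # and skip gradient checkpointing so the custom Flamingo layers are
        # never called through torch.utils.checkpoint.
        lang_encoder = prepare_model_for_kbit_training(
            lang_encoder, use_gradient_checkpointing=False
        )
    else:
        lang_encoder = LlamaForCausalLM.from_pretrained(lang_encoder_path)

    # ... then continue exactly as in create_model_and_transforms above, and drop
    # the later prepare_model_for_kbit_training(model) / model.restart(...) block.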

Who can help?

No response

Reproduction

language_datasets = [
    dict(
        type="alpaca_gpt4",
        ann_path="data/alpaca_gpt4/alpaca_gpt4_data.json",
    ),
]

- Download the LLaVA and Alpaca GPT-4 data according to the official README:

    - [LLaVA](https://llava-vl.github.io/): download from [liuhaotian/LLaVA-Instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K) and place it in `data/llava/`.

    - [Alpaca GPT4](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM): download it from [this link](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM/raw/main/data/alpaca_gpt4_data.json) and place it in `data/alpaca_gpt4/alpaca_gpt4_data.json`.

- Replace https://github.com/open-mmlab/Multimodal-GPT/blob/main/mmgpt/models/open_flamingo/builder.py with the modified builder.py shown above.
- Run:

torchrun --nproc_per_node=1 mmgpt/train/instruction_finetune.py \
    --lm_path /scratch/yerong/.cache/pyllama/hf/7B \
    --tokenizer_path /scratch/yerong/.cache/pyllama/hf/7B \
    --pretrained_path OpenFlamingo-9B/checkpoint.pt \
    --run_name train-my-gpt4 \
    --learning_rate 1e-5 \
    --lr_scheduler cosine \
    --batch_size 1 \
    --tuning_config configs/lora_config.py \
    --dataset_config configs/dataset_config.py


### Expected behavior

Total training steps: 99883
Found no checkpoints for run train-my-gpt4.
0%| | 0/99883 [00:00<?, ?it/s]/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None
  warnings.warn("None of the inputs have requires_grad=True. Gradients will be None")
0%| | 0/99883 [00:01<?, ?it/s]
Traceback (most recent call last):
  File "/scratch/yerong/Multimodal-GPT/mmgpt/train/instruction_finetune.py", line 470, in <module>
    main()
  File "/scratch/yerong/Multimodal-GPT/mmgpt/train/instruction_finetune.py", line 302, in main
    train_one_epoch(
  File "/scratch/yerong/Multimodal-GPT/mmgpt/train/instruction_finetune.py", line 390, in train_one_epoch
    loss_batch = model(
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
    output = self._run_ddp_forward(*inputs, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
    return module_to_run(*inputs[0], **kwargs[0])  # type: ignore[index]
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/scratch/yerong/Multimodal-GPT/mmgpt/models/open_flamingo/flamingo.py", line 104, in forward
    output = self.lang_encoder(
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/peft/peft_model.py", line 416, in forward
    return self.get_base_model()(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 688, in forward
    outputs = self.model(
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 570, in forward
    layer_outputs = torch.utils.checkpoint.checkpoint(
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 249, in checkpoint
    return CheckpointFunction.apply(function, preserve, *args)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/utils/checkpoint.py", line 107, in forward
    outputs = run_function(*args)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/transformers/models/llama/modeling_llama.py", line 566, in custom_forward
    return module(*inputs, output_attentions, None)
  File "/scratch/yerong/.conda/envs/mmgpt/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: forward() takes from 2 to 3 positional arguments but 7 were given
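
Reading the traceback, the failure happens inside `torch.utils.checkpoint.checkpoint` in `modeling_llama.py`: gradient checkpointing is active (presumably turned on by `prepare_model_for_kbit_training`, which enables it by default), and the checkpoint wrapper calls the decoder layer with 7 positional arguments while the Flamingo-wrapped layer's `forward()` only accepts 2 to 3. An unverified way to test that hypothesis is to keep the quantization but skip that side effect:

# Hypothesis check (untested): prepare for k-bit training without enabling
# gradient checkpointing, so torch.utils.checkpoint never wraps the
# Flamingo decoder layers.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)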

YerongLi commented 1 year ago

In short, are there any ways to directly quantize this model? For reference, the relevant part of the `Flamingo` class is below; a rough sanity-check sketch follows it.

class Flamingo(nn.Module):
    def __init__(
        self,
        vision_encoder: nn.Module,
        lang_encoder: nn.Module,
        eoc_token_id: int,
        media_token_id: int,
        vis_dim: int,
        cross_attn_every_n_layers: int = 1,
        gradient_checkpointing: bool = False,
    ):
        """
        Args:
            vision_encoder (nn.Module): HF CLIPModel
            lang_encoder (nn.Module): HF causal language model
            eoc_token_id (int): Token id for <|endofchunk|>
            media_token_id (int): Token id for <image>
            vis_dim (int): Dimension of the visual features.
                Visual features are projected to match this shape along the last dimension.
            cross_attn_every_n_layers (int, optional): How often to apply cross attention after transformer layer. Defaults to 1.
        """
        super().__init__()
        self.eoc_token_id = eoc_token_id
        self.media_token_id = media_token_id
        self.vis_dim = vis_dim
        if hasattr(lang_encoder.config, "d_model"):
            self.lang_dim = lang_encoder.config.d_model  # mpt uses d_model
        else:
            self.lang_dim = lang_encoder.config.hidden_size

        self.vision_encoder = vision_encoder.visual
        self.perceiver = PerceiverResampler(dim=self.vis_dim)
        self.lang_encoder = lang_encoder
        self.lang_encoder.init_flamingo(
            media_token_id=media_token_id,
            lang_hidden_size=self.lang_dim,
            vis_hidden_size=self.vis_dim,
            cross_attn_every_n_layers=cross_attn_every_n_layers,
            gradient_checkpointing=gradient_checkpointing,
        )
        self._use_gradient_checkpointing = gradient_checkpointing
        self.perceiver._use_gradient_checkpointing = gradient_checkpointing
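
For what it's worth, `Flamingo.__init__` keeps the vision tower untouched and only calls `init_flamingo` on `lang_encoder`, so quantization really only needs to hit the language model; loading it with the `BitsAndBytesConfig` in the builder already does that, and the perceiver plus the gated cross-attention blocks that `init_flamingo` adds are ordinary full-precision modules created afterwards. A rough sanity check could look like the sketch below (the `gated_cross_attn`/`perceiver` substrings are assumptions based on the open_flamingo naming, mirroring the `config.unfrozen` matching in `prepare_model_for_tuning`):

import bitsandbytes as bnb

# Count the 4-bit linears; they should all live under model.lang_encoder.
quantized = [n for n, m in model.named_modules() if isinstance(m, bnb.nn.Linear4bit)]
print(f"{len(quantized)} 4-bit linear layers")

# Keep the newly created Flamingo parts trainable alongside the LoRA adapters.
for name, param in model.named_parameters():
    if "gated_cross_attn" in name or "perceiver" in name:
        param.requires_grad = True
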
github-actions[bot] commented 1 year ago

This issue has been automatically marked as stale because it has not had recent activity. If you think this still needs to be addressed please comment on this thread.