QwenLM / Qwen2-VL

Qwen2-VL is the multimodal large language model series developed by Qwen team, Alibaba Cloud.

⚠️⚠️ Quantization of finetuned Qwen2-VL 7B model using AutoAWQ fails with OOM regardless of number of GPUs. #521

Open mehamednews opened 1 day ago

mehamednews commented 1 day ago

I fine-tuned Qwen2-VL 7B using ms-swift and I'm now trying to quantize it. I first tried ms-swift itself, which failed, then followed the guide in your repo to write a custom script. Even with 8×A100 (80 GB) it still OOMs, with just 8 calibration samples. I'm not sure why this is happening; any help would be appreciated!

Below is the script I'm using; it converts the ms-swift JSONL format into the message format the processor expects.
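
For reference, each line of the ms-swift JSONL is assumed to look roughly like this (inferred from the keys the script reads; the values are placeholders):

{"query": "Describe the image.", "response": "A cat sitting on a sofa.", "images": ["imgs/0001.jpg"]}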

import json
from typing import List, Dict, Any
from transformers import Qwen2VLProcessor
from awq.models.qwen2vl import Qwen2VLAWQForConditionalGeneration
from qwen_vl_utils import process_vision_info

def load_jsonl(file_path: str) -> List[Dict[str, Any]]:
    """Load data from a JSONL file."""
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f]

def prepare_dataset(dataset_file: str, n_samples: int = 2) -> List[List[Dict]]:
    dataset = load_jsonl(dataset_file)
    dataset = dataset[:n_samples]

    formatted_data = []
    for item in dataset:
        # Create the image content list (supports multiple images per sample)
        image_content = []
        for image_path in item["images"]:
            image_content.append({"type": "image", "image": f"file://output/{image_path}"})  # Assuming local file path

        # Add the text query after the images
        image_content.append({"type": "text", "text": item["query"]})

        formatted_message = [
            {"role": "user", "content": image_content},
            {"role": "assistant", "content": item["response"]},
        ]
        formatted_data.append(formatted_message)

    return formatted_data

def main():
    # Configuration
    model_path = "./qwen2-vl-7b-instruct/v0-20241102-150323/checkpoint-660"
    quant_path = "./qwen2-vl-7b-instruct/checkpoint-660-awq"
    dataset_file = "./label-dataset-train.jsonl"

    # Quantization config
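    # zero_point=True  -> asymmetric (zero-point) quantization
    # q_group_size=128 -> weights quantized in groups of 128
    # w_bit=4          -> 4-bit weights; version "GEMM" -> the usual AutoAWQ kernel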
    quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

    print("Loading processor and model...")
    processor = Qwen2VLProcessor.from_pretrained(model_path)

    model = Qwen2VLAWQForConditionalGeneration.from_pretrained(
        model_path,
        model_type="qwen2_vl",
        use_cache=False,
    )

    print("Preparing dataset...")
    dataset = prepare_dataset(dataset_file)

    print("Processing inputs...")
    text = processor.apply_chat_template(dataset, tokenize=False, add_generation_prompt=True)

    # Process vision info (handles multiple images per message)
    image_inputs, video_inputs = process_vision_info(dataset)

    inputs = processor(text=text, images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt")

    print("Starting quantization...")
    model.quantize(calib_data=inputs, quant_config=quant_config)

    print("Saving quantized model...")
    # Enable use_cache for inference
    model.model.config.use_cache = model.model.generation_config.use_cache = True
    model.save_quantized(quant_path, safetensors=True, shard_size="4GB")
    processor.save_pretrained(quant_path)

    print(f"Quantization complete! Model saved to: {quant_path}")

if __name__ == "__main__":
    main()
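
One knob I haven't tried yet, so treat this as an untested sketch rather than a fix: the Qwen2-VL docs describe min_pixels/max_pixels options on the processor that bound the number of vision tokens per image, which should shrink the calibration tensors. The values below are illustrative, not tuned:

# Untested variant: cap the vision token count per image during calibration.
# min_pixels / max_pixels are documented Qwen2VLProcessor options; the exact
# values here are placeholders.
processor = Qwen2VLProcessor.from_pretrained(
    model_path,
    min_pixels=256 * 28 * 28,
    max_pixels=768 * 28 * 28,  # smaller cap => fewer vision tokens => less memory
)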
lebronjamesking commented 1 day ago

I have the same problem, so weird.

mehamednews commented 9 hours ago

@kq-chen Hope you can give us a hint on how to handle this 🙏