vllm-project / vllm

A high-throughput and memory-efficient inference and serving engine for LLMs
https://docs.vllm.ai
Apache License 2.0

[New Model]: We are able to run the Phi-3.5 Vision Instruct model but want to run it with int4 quantization #8463

Open thalapandi opened 2 months ago

thalapandi commented 2 months ago

The model to consider.

from typing import List
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from PIL import Image
from vllm import LLM, SamplingParams
import os
import uvicorn
import time

app = FastAPI()

class InferenceRequest(BaseModel):
    model: str
    question: str
    image_paths: List[str]

# Initialize models once during application startup
models = {}

def load_image_from_path(image_path: str) -> Image.Image:
    """Load a PIL image from a local file path."""
    if not os.path.isfile(image_path):
        raise ValueError(f"File {image_path} does not exist.")
    return Image.open(image_path).convert("RGB")

def load_phi3v():
    """Load Phi3V model and return instance."""
    return LLM(
        model="microsoft/Phi-3.5-vision-instruct",
        trust_remote_code=True,
        max_model_len=4096,
        limit_mm_per_prompt={"image": 1},
    )

def initialize_models():
    """Initialize all models required for inference."""
    global models
    models["phi3_v"] = load_phi3v()

def load_phi3v_prompt(question, image_paths: List[str]):
    placeholders = "\n".join(f"<|image_{i}|>" for i, _ in enumerate(image_paths, start=1))
    prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
    stop_token_ids = None
    return prompt, stop_token_ids

def run_generate(model_name: str, question: str, image_paths: List[str]):
    if model_name not in models:
        raise ValueError(f"Model {model_name} is not loaded.")

    llm = models[model_name]
    prompt, stop_token_ids = load_phi3v_prompt(question, image_paths)
    image_data = [load_image_from_path(path) for path in image_paths]

    sampling_params = SamplingParams(temperature=0.0, max_tokens=128, stop_token_ids=stop_token_ids)
    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {
                "image": image_data
            },
        },
        sampling_params=sampling_params
    )
    return [o.outputs[0].text for o in outputs]

@app.on_event("startup")
async def startup_event():
    initialize_models()

@app.post("/inference")
async def inference(request: InferenceRequest):
    try:
        start_time = time.time()

        result = run_generate(request.model, request.question, request.image_paths)
        end_time = time.time()
        print("total time taken",end_time-start_time)
        return {"results": result}
    except ValueError as ve:
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8002)
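
For reference, a request against this endpoint would look roughly like the sketch below. This is not part of the original issue; the "phi3_v" key matches the model registered at startup, and the image path is a placeholder.

import requests

# Hypothetical client call for the /inference endpoint above.
response = requests.post(
    "http://localhost:8002/inference",
    json={
        "model": "phi3_v",
        "question": "What is shown in this image?",
        "image_paths": ["/path/to/image.jpg"],  # placeholder path
    },
)
print(response.json())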

The closest model vllm already supports.

The Phi-3.5 Vision Instruct model; I need a reference for this.

What's your difficulty of supporting the model you want?

The documentation does not contain any information about quantization for the Phi-3.5 Vision Instruct model.

Before submitting a new issue...

DarkLight1337 commented 2 months ago

Are you using a custom quantized model? I don't see it on HuggingFace.

thalapandi commented 2 months ago

I am using only the Phi-3.5 Vision Instruct model and want to run it in vLLM with 4-bit quantization. One more doubt I have: can I use this engine configuration for the Phi-3.5 model as well?

like this """ Saves each worker's model state dict directly to a checkpoint, which enables a fast load path for large tensor-parallel models where each worker only needs to read its own shard rather than the entire checkpoint.

Example usage:

python save_sharded_state.py \ --model /path/to/load \ --quantization deepspeedfp \ --tensor-parallel-size 8 \ --output /path/to/save

Then, the model can be loaded with

llm = LLM( model="/path/to/save", load_format="sharded_state", quantization="deepspeedfp", tensor_parallel_size=8, ) """ import dataclasses import os import shutil from pathlib import Path

from vllm import LLM, EngineArgs from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser() EngineArgs.add_cli_args(parser) parser.add_argument("--output", "-o", required=True, type=str, help="path to output checkpoint") parser.add_argument("--file-pattern", type=str, help="string pattern of saved filenames") parser.add_argument("--max-file-size", type=str, default=5 * 1024**3, help="max size (in bytes) of each safetensors file")

def main(args): engine_args = EngineArgs.from_cli_args(args) if engine_args.enable_lora: raise ValueError("Saving with enable_lora=True is not supported!") model_path = engine_args.model if not Path(model_path).is_dir(): raise ValueError("model path must be a local directory")

Create LLM instance from arguments

llm = LLM(**dataclasses.asdict(engine_args))
# Prepare output directory
Path(args.output).mkdir(exist_ok=True)
# Dump worker states to output directory
model_executor = llm.llm_engine.model_executor
model_executor.save_sharded_state(path=args.output,
                                  pattern=args.file_pattern,
                                  max_size=args.max_file_size)
# Copy metadata files to output directory
for file in os.listdir(model_path):
    if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
        if os.path.isdir(os.path.join(model_path, file)):
            shutil.copytree(os.path.join(model_path, file),
                            os.path.join(args.output, file))
        else:
            shutil.copy(os.path.join(model_path, file), args.output)

if name == "main": args = parser.parse_args() main(args)

DarkLight1337 commented 2 months ago

@Isotr0py are you familiar with this?

Isotr0py commented 2 months ago

I'm not sure what "int4 quantization" refers to exactly here, because it seems there is no BNB 4-bit quantized Phi3-V model released on HF. (The code given above uses deepspeedfp quantization, which is fp6/fp8 quantization.)

If "int4 quantization" just means 4-Bit quantization, Phi-3.5-vision-instruct-AWQ with awq quantization should work on VLLM.

thalapandi commented 2 months ago

How many GPUs are needed to run AWQ quantization? Is it possible to run TensorRT in vLLM? If so, is there any documentation for Phi-3.5 Vision Instruct?

Isotr0py commented 2 months ago

It costs about 4 GB of VRAM to run the 4-bit AWQ-quantized Phi-3.5-vision-instruct.

BTW, the AWQ model I uploaded was calibrated with the default dataset in AutoAWQ, because I only used it to check code consistency. You should calibrate from the source model with your own datasets to get better quality.
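
As a rough illustration of that calibration step, the sketch below uses AutoAWQ with a custom list of calibration prompts. The quant_config values, output path, and prompts are assumptions rather than the settings used for the uploaded checkpoint, and a multimodal model like Phi-3.5-vision may need model-specific handling in AutoAWQ.

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

model_path = "microsoft/Phi-3.5-vision-instruct"
quant_path = "Phi-3.5-vision-instruct-awq"  # hypothetical output directory
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}

model = AutoAWQForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Replace these with prompts that resemble your real traffic instead of the
# default calibration dataset.
calib_data = [
    "<|user|>\nDescribe the key findings in this report.<|end|>\n<|assistant|>\n",
    "<|user|>\nSummarize the table in two sentences.<|end|>\n<|assistant|>\n",
]

model.quantize(tokenizer, quant_config=quant_config, calib_data=calib_data)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)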

I think vLLM can't run TensorRT currently. (FYI, https://github.com/vllm-project/vllm/issues/5134#issuecomment-2139618073)

thalapandi commented 2 months ago

ok

DarkLight1337 commented 2 months ago

Does this work for you?