OSError: Error no file named model.safetensors found in directory

Hi, I have finetuned Qwen2-VL using Llama-Factory. I successfully quantized the fine-tuned model as given

from transformers import Qwen2VLProcessor
from auto_gptq import BaseQuantizeConfig
from auto_gptq.modeling import Qwen2VLGPTQForConditionalGeneration
from qwen_vl_utils import process_vision_info
import torch
torch.cuda.empty_cache()

# Specify paths and hyperparameters for quantization
model_path = "/content/drive/MyDrive/LLM/vinplate2-3000-merged"
quant_path = "/content/drive/MyDrive/LLM/vinplate2-gwen2-vl-gptq-4bit"
quantize_config = BaseQuantizeConfig(
    bits=4,  # 4 or 8
    group_size=128,
    damp_percent=0.1,
    desc_act=False,  # set to False can significantly speed up inference but the perplexity may slightly bad
    static_groups=False,
    sym=True,
    true_sequential=True,
)
# Load your processor and model with AutoGPTQ
processor = Qwen2VLProcessor.from_pretrained(model_path)
# We recommend enabling flash_attention_2 for better acceleration and memory saving
model = Qwen2VLGPTQForConditionalGeneration.from_pretrained(model_path, quantize_config, attn_implementation="flash_attention_2")
# model = Qwen2VLGPTQForConditionalGeneration.from_pretrained(model_path, quantize_config)
model.to("cuda:0")

import ast
my_file = open("/content/drive/MyDrive/LLM/dataset_caliber.txt", "r")

# reading the file
data = my_file.read()

data_into_list = data.split("\n")
dataset = data_into_list[:-1]

final_dataset = []
for x in dataset:
    # print('x')
    # print(x)
    x1 = ast.literal_eval(x)
    final_dataset.append(x1)
def batched(iterable, n: int):
    # batched('ABCDEFG', 3) → ABC DEF G
    assert n >= 1, "batch size must be at least one"
    from itertools import islice

    iterator = iter(iterable)
    while batch := tuple(islice(iterator, n)):
        yield batch

batch_size = 1
calib_data = []
for batch in batched(final_dataset, batch_size):
    text = processor.apply_chat_template(
        batch, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(batch)
    inputs = processor(
        text=text,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    calib_data.append(inputs)

model.quantize(calib_data, cache_examples_on_gpu=False)
model.save_quantized(quant_path, use_safetensors=True)
processor.save_pretrained(quant_path)

But then I tried to infer the quantized custom Qwen2-VL model using this code...

from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import time
import json
import re
import torch
torch.cuda.empty_cache()
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

modelpath = '/content/drive/MyDrive/LLM/vinplate2-gwen2-vl-gptq-4bit'

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    modelpath,
    torch_dtype="auto",
    attn_implementation="flash_attention_2",
    device_map=DEVICE,
    use_safetensors=True,
    # disable_exllama=True
)
model.to(DEVICE)

# The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
min_pixels = 256 * 28 * 28
max_pixels = 1280 * 28 * 28

processor = AutoProcessor.from_pretrained(
    modelpath, min_pixels=min_pixels, max_pixels=max_pixels
)
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "/content/drive/MyDrive/LLM/test/Vin_2023-12-22_14-47-37.jpg",
            },
            {"type": "text", "text": 
                                    '''
                                    Please extract the Vehicle Sr No, Engine No, and Model from this image.
                                    Response only json format nothing else.
                                    Analyze the font and double check for similar letters such as "V":"U", "8":"S":"0", "R":"P".
                                    '''
            },
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to(DEVICE)

# Inference: Generation of the output
t1 = time.time()
output_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
t2 = time.time()
print(output_text)
print(t2-t1)

I got this error: OSError: Error no file named model.safetensors found in directory /content/drive/MyDrive/LLM/vinplate2-gwen2-vl-gptq-4bit.

I am not sure what I did wrong. Please help me. I think its transformer version issue but not sure which version is correct.

My Enviroment: Linux (Google Colab) CUDA 12.2 Python 3.10.12 transformers 4.45.0.dev0 (pip install git+https://github.com/huggingface/transformers@21fac7abba2a37fae86106f87fcf9974fd1e3830 accelerate) torch 2.4.1+cu121 auto_gptq 0.6.0.dev0+cu1222 accelerate 1.0.1 ninja 1.11.1.1 tokenizers 0.19.1 flash_attn 2.6.3

QwenLM / Qwen2-VL

OSError: Error no file named model.safetensors found in directory #492