Open smilebetterworld opened 2 hours ago
# Inference script: OCR-style raw-text extraction from one image with
# MiniCPM-V 2.6 (AWQ INT4) served through vLLM, printing GPU memory stats
# and end-to-end latency afterwards.
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
import time
import GPUtil

IMAGES = [
    "/data/1666770191808_crop_0.jpg",  # local image path
]
MODEL_NAME = "/data/MiniCPM-V_2_6_awq_int4"

image = Image.open(IMAGES[0]).convert("RGB")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
llm = LLM(
    model=MODEL_NAME,
    # NOTE(review): vLLM pre-allocates this *fraction of total GPU memory*
    # (weights + KV-cache), so utilization=1 grabs ~all VRAM regardless of
    # how small the quantized model is — this is why the INT4 model shows
    # the same memory footprint as the FP model. Lower this value to see
    # the quantization savings.
    gpu_memory_utilization=1,
    trust_remote_code=True,
    max_model_len=2048,  # adjust to fit available memory
)

question = "extract only raw text from the given image.Don't add any information or commentary."
# NOTE(review): the original paste was truncated here; reconstructed from the
# standard MiniCPM-V 2.6 vLLM example — the image placeholder token precedes
# the question in the user turn. Confirm against the model card.
messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + question}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
sampling_params = SamplingParams(
    stop_token_ids=stop_token_ids,
    max_tokens=1024,
    temperature=0,  # greedy decoding for deterministic OCR output
    best_of=1,
)

st = time.time()
outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {"image": image},
    },
    sampling_params=sampling_params,
)
latency = time.time() - st

# Report per-GPU load/memory and the wall-clock latency of the generate call.
gpus = GPUtil.getGPUs()
for gpu in gpus:
    print(f"GPU ID: {gpu.id}, GPU负载: {round(gpu.load*100,2)}%, Memory Total: {gpu.memoryTotal}MB, 显存占用: {gpu.memoryUsed}MB, Memory Free: {gpu.memoryFree}MB")
print('latency is {} seconds'.format(latency))
git clone https://www.modelscope.cn/models/linglingdan/MiniCPM-V_2_6_awq_int4 用这个量化后的INT4模型推理,显存占用大概20G,和fp模型显存占用情况基本一样,请教下是不是量化存在问题?