Your current environment

python 3.10, vllm 0.5.4

🐛 Describe the bug

The official demo runs without problems:
from transformers import AutoTokenizer
from PIL import Image
from vllm import LLM, SamplingParams, EngineArgs, LLMEngine
MODEL_NAME = "/media/sofun/linux/model/MiniCPM-V-2_6"
# Also available for previous models
# MODEL_NAME = "openbmb/MiniCPM-Llama3-V-2_5"
# MODEL_NAME = "HwwwH/MiniCPM-V-2"
image = Image.open("/home/sofun/桌面/graph.png").convert("RGB")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
llm = LLM(
    model=MODEL_NAME,
    trust_remote_code=True,
    gpu_memory_utilization=1,
    max_model_len=2048
)
messages = [{
    "role": "user",
    # One `(<image>./</image>)` placeholder per image
    "content": "(<image>./</image>)" + \
        "\n这是一张什么图片?"  # "What is this picture?"
}]
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
# Single inference
inputs = {
    "prompt": prompt,
    "multi_modal_data": {
        "image": image
        # For multiple images, the number of images must match the number of `(<image>./</image>)` placeholders
        # "image": [image, image]
    },
}
# Batch inference
# inputs = [{
#     "prompt": prompt,
#     "multi_modal_data": {
#         "image": image
#     },
# } for _ in range(2)]
# MiniCPM-V 2.6
stop_tokens = ['<|im_end|>', '<|endoftext|>']
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
# MiniCPM-V 2.0
# stop_token_ids = [tokenizer.eos_id]
# MiniCPM-Llama3-V 2.5
# stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
sampling_params = SamplingParams(
    stop_token_ids=stop_token_ids,
    use_beam_search=True,
    temperature=0,
    best_of=3,
    max_tokens=1024
)
outputs = llm.generate(inputs, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
The above code correctly describes the image content.
But after switching to LLMEngine for inference, it completely fails to describe the image correctly, even though the SamplingParams are exactly the same. I don't know why. I want to use LLMEngine because it is better suited to streaming.
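For reference, a minimal sketch of how an equivalent engine can be built, assuming LLMEngine.from_engine_args with EngineArgs that mirror the LLM arguments above (the exact construction in my code may differ slightly):

# Sketch (assumption): build an LLMEngine with the same arguments as the LLM above.
engine_args = EngineArgs(
    model=MODEL_NAME,
    trust_remote_code=True,
    gpu_memory_utilization=1,
    max_model_len=2048
)
engine = LLMEngine.from_engine_args(engine_args)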
from typing import List, Tuple

# ZMQCommunicator, extract_request_data, RED and RESET are helpers defined elsewhere in my code.
async def process_requests(engine: LLMEngine,
                           test_prompts: List[Tuple[str, str, SamplingParams]]):  # each test_prompts entry also carries a request id
    communicator = ZMQCommunicator()
    print(RED + f"{communicator}" + RESET)
    whole_tokens = {}
    while test_prompts or engine.has_unfinished_requests():
        if test_prompts:
            # if request_ids != 0:
            #     assert isinstance(request_ids, list)  # if request_ids is passed in, it must be a list
            #     request_id = request_ids.pop(0)
            request_id, prompt, sampling_params = test_prompts.pop(0)
            print(RED + 'request_id' + RESET, request_id)
            engine.add_request(str(request_id), prompt, sampling_params)
        # if request_ids == 0:
        request_outputs = extract_request_data(engine.step())
        # print(request_outputs, type(request_outputs))
        for request_output_key, request_output_value in request_outputs.items():
            # print(request_output_value, type(request_output_value))
            if request_output_value["finished"] == 1:
                whole_tokens[request_output_key] = "".join(request_output_value['whole_tokens'])
        await communicator.send_message(request_outputs)
        # print(f"Message sent to the MQ queue: {request_outputs}")
    return whole_tokens
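For comparison, here is a minimal sketch of how I would expect a multi-modal request to be fed to the engine, assuming engine.add_request in vLLM 0.5.4 accepts the same {"prompt", "multi_modal_data"} dict that llm.generate takes (the request_id "0" and the variable names are placeholders):

# Sketch (assumption): pass the prompt and the image together as one inputs dict,
# mirroring the `inputs` dict used with llm.generate above.
engine_inputs = {
    "prompt": prompt,
    "multi_modal_data": {"image": image},
}
engine.add_request("0", engine_inputs, sampling_params)

# Stream partial results step by step until nothing is left to schedule.
while engine.has_unfinished_requests():
    for request_output in engine.step():
        print(request_output.outputs[0].text)  # grows as tokens are generated
        if request_output.finished:
            print("finished:", request_output.request_id)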