QwenLM / Qwen2-VL

Qwen2-VL is the multimodal large language model series developed by Qwen team, Alibaba Cloud.
Apache License 2.0

When deploying Qwen2-VL with vLLM, how do I get streaming output? Is there any reference material? #278

Open glz-11 opened 4 weeks ago

glz-11 commented 4 weeks ago

sampling_params = SamplingParams(
    temperature=0.1,
    top_p=0.001,
    repetition_penalty=1.05,
    max_tokens=2048,
    stop_token_ids=[],
    stream=True,  # streaming is not supported here...
)

generator = llm.generate([llm_inputs], sampling_params=sampling_params)  # adding it here doesn't work either
Uhao-P commented 3 weeks ago

1. Use lmdeploy directly.
2. Adapt a project that wraps an OpenAI-style API, e.g. https://github.com/xusenlinzy/api-for-open-llm

ywang96 commented 3 weeks ago

Feel free to take a look at https://docs.vllm.ai/en/latest/getting_started/examples/openai_vision_api_client.html#openai-vision-api-client

To enable streaming, you'll need to specify stream=True like the example below

chat_completion_from_url = client.chat.completions.create(
    messages=[{
        "role":
        "user",
        "content": [
            {
                "type": "text",
                "text": "What's in this image?"
            },
            {
                "type": "image_url",
                "image_url": {
                    "url": image_url
                },
            },
        ],
    }],
    model=model,
    max_tokens=64,
    stream=True,
)
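With stream=True the call returns an iterator of chunks instead of a single response. A minimal sketch of consuming it, assuming the standard openai Python client from the example above:

# Each chunk carries an incremental delta of the generated text.
for chunk in chat_completion_from_url:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)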
glz-11 commented 3 weeks ago

Thanks!! The OpenAI-style API doesn't support multi-video input yet, so I didn't go with it. I used this approach instead (envs.py was modified to use the 'spawn' method: /home/by/miniconda3/envs/qwen2_vl/lib/python3.12/site-packages/vllm/envs.py; the modified locations are marked in that file, and without the change a spawn error is raised):

llm = vllm.LLM(
    model=model_path,
    # limit_mm_per_prompt={"image": 10, "video": 10},  # raises an error
    tensor_parallel_size=2,
    dtype="bfloat16",
    gpu_memory_utilization=0.5,
    max_model_len=2048,   # not sure where this parameter comes from; it caps length so GPU memory doesn't run out, but too small and the output gets cut off
    # max_seq_len_to_capture=1024,  # raises an error!!
)

No matter what I tried, I couldn't get streaming to work.

ywang96 commented 3 weeks ago

The LLM class is actually only designed for offline inference (thus you can't do streaming with it).

glz-11 commented 3 weeks ago

Thank you! I was hoping to deploy it like this, but it didn't succeed.

import signal
import sys
import time
from threading import Thread

import flash_attn
import torch
import vllm
from flask import Flask, request, Response, jsonify
from qwen_vl_utils import process_vision_info
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
from vllm import LLM, SamplingParams

app = Flask(__name__)


def initialize_llm(model_path):
    """
    Initialize the LLM object.

    :param model_path: local path to the model
    :return: LLM object
    """
    # envs.py was modified to use the 'spawn' method:
    # /home/by/miniconda3/envs/qwen2_vl/lib/python3.12/site-packages/vllm/envs.py
    # (the modified locations are marked there; without the change a spawn error is raised)
    llm = vllm.LLM(
        model=model_path,
        # limit_mm_per_prompt={"image": 10, "video": 10},  # raises an error
        tensor_parallel_size=2,
        dtype="bfloat16",
        gpu_memory_utilization=0.5,
        max_model_len=2048,   # not sure where this parameter comes from; caps length so GPU memory doesn't run out, but too small and the output gets cut off
        # max_seq_len_to_capture=1024,  # raises an error!!
    )
    return llm


@app.route('/predict', methods=['POST'])
def predict():
    """Handle a prediction request."""
    data = request.get_json()

    # Get the message list
    messages = data.get('messages')

    processor = AutoProcessor.from_pretrained("/home/by/.cache/modelscope/hub/qwen/Qwen2-VL-7B-Instruct")

    prompt = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    sampling_params = SamplingParams(
        temperature=0.1,
        top_p=0.001,
        repetition_penalty=1.05,
        max_tokens=2048,
        stop_token_ids=[],
    )

    image_inputs, video_inputs = process_vision_info(messages)

    mm_data = {}
    if image_inputs is not None:
        mm_data["image"] = image_inputs
    if video_inputs is not None:
        mm_data["video"] = video_inputs

    llm_inputs = {
        "prompt": prompt,
        "multi_modal_data": mm_data,
    }

    # # Initialize TextStreamer
    # tokenizer = processor.tokenizer
    # streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)

    def generate():
        generator = llm.generate([llm_inputs], sampling_params=sampling_params)
        for output in generator:
            text = output.outputs[0].text
            print(text)
            yield text.encode('utf-8')
            # time.sleep(0.1)

    # Response is the streaming return mechanism; direct_passthrough=True controls whether to stream.
    return Response(generate(), mimetype='text/plain', direct_passthrough=True)


def terminate_process():
    """Terminate the process and its children."""
    print("Shutting down the service...")
    sys.exit(0)


if __name__ == "__main__":
    # Set the model name and path
    model_path = "/home/by/.cache/modelscope/hub/qwen/Qwen2-VL-7B-Instruct"

    # Initialize the model
    llm = initialize_llm(model_path)

    # Add a signal handler so the service can be stopped with SIGINT (Ctrl+C)
    def signal_handler(sig, frame):
        print("Received interrupt signal, shutting down the service...")
        terminate_process()

    # Register the signal handler
    signal.signal(signal.SIGINT, signal_handler)

    # Start the Flask service
    app.run(host='0.0.0.0', port=7000)
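For reference, a minimal client sketch to check whether /predict actually streams; the host, port, and image URL are placeholders, and the message layout assumes the format expected by process_vision_info:

import requests

payload = {
    "messages": [{
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/demo.jpg"},  # placeholder image URL
            {"type": "text", "text": "What's in this image?"},
        ],
    }]
}

# stream=True plus iter_content lets the client print chunks as they arrive.
with requests.post("http://127.0.0.1:7000/predict", json=payload, stream=True) as resp:
    for chunk in resp.iter_content(chunk_size=None):
        print(chunk.decode("utf-8"), end="", flush=True)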
glz-11 commented 3 weeks ago

Oh well, I'm giving up on the vLLM deployment for now!!

ywang96 commented 3 weeks ago

If you're building an API server out of vLLM, you should use the AsyncLLMEngine object.

We have a toy example here https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/api_server.py.
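A condensed sketch along the lines of that toy example; the model path, sampling settings, and prompt below are placeholders, not the settings from this thread:

import asyncio

from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

# AsyncLLMEngine is the streaming-capable counterpart of the offline LLM class.
engine = AsyncLLMEngine.from_engine_args(
    AsyncEngineArgs(model="Qwen/Qwen2-VL-7B-Instruct", max_model_len=2048)  # placeholder engine settings
)

async def stream(prompt: str) -> None:
    sampling_params = SamplingParams(temperature=0.1, top_p=0.001, max_tokens=2048)
    previous = ""
    # generate() yields RequestOutput objects holding the text produced so far for this request_id.
    async for request_output in engine.generate(prompt, sampling_params, request_id="req-0"):
        text = request_output.outputs[0].text
        print(text[len(previous):], end="", flush=True)  # emit only the newly generated delta
        previous = text

asyncio.run(stream("Describe what streaming output means."))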

glz-11 commented 3 weeks ago

Thanks, I'll take a look and study it.