xjDUAN184 opened this issue 6 months ago
This happens when `tensor_parallel_size=2` is used:
I once had this problem on GCP with an older version of vLLM. It turned out to be a bug in Ray's GPU detection on GCP. That problem was fixed a few weeks ago. Can you try the newest version of vLLM? With vLLM 0.2.1 you're on an old one.
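If it's useful to rule that out before upgrading, Ray's own view of the GPUs can be checked directly in the environment the server runs in. This is just a quick diagnostic sketch, not part of vLLM:

```python
# Quick diagnostic sketch: confirm that Ray detects the GPUs that vLLM's
# tensor-parallel workers would be scheduled on.
import ray

ray.init()
print(ray.cluster_resources())  # for two visible A800s this should include "GPU": 2.0
ray.shutdown()
```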
My GPUs are A800s. When I do not set the tensor_parallel_size parameter, the vLLM + Qwen-14B API starts normally and automatically uses GPU 0.
When I set tensor_parallel_size=2, a RuntimeError occurs.
The main contents are: `RuntimeError: ProcessGroupNCCL is only supported with GPUs, no GPUs found!` together with `Warning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 103: integrity checks failed (function operator())`.
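Since the warning comes from `cudaGetDeviceCount()`, it is also worth confirming that PyTorch itself can enumerate both GPUs in the exact shell the server is launched from. A small diagnostic sketch (the expected values assume a node with two visible A800s):

```python
# Small diagnostic sketch: check CUDA visibility from PyTorch before
# launching vLLM with tensor_parallel_size=2.
import os
import torch

print(os.environ.get("CUDA_VISIBLE_DEVICES"))  # should list both GPU ids, or be unset
print(torch.cuda.is_available())               # expected: True
print(torch.cuda.device_count())               # expected: 2
```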
I checked my environment: CUDA 11.7, torch 2.0.1, Python 3.8.16, vLLM 0.2.1.
How can this problem be solved? My API server code is below:
```python
import argparse
import asyncio
import json
import time
from http import HTTPStatus
from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

import torch
import re
import fastapi
import uvicorn
from fastapi import Request
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse, Response
from packaging import version

from typing import Any  # newly added code
import logging
from datetime import datetime, timedelta, timezone  # newly added code

from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (
    CompletionRequest, CompletionResponse, CompletionResponseChoice,
    CompletionResponseStreamChoice, CompletionStreamResponse,
    ChatCompletionRequest, ChatCompletionResponse,
    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
    ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
    LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
from vllm.logger import init_logger, _setup_logger
from vllm.outputs import RequestOutput, CompletionOutput
from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.utils import random_uuid
from collections import defaultdict
from difflib import SequenceMatcher

try:
    import fastchat
    from fastchat.conversation import Conversation, SeparatorStyle
    from fastchat.model.model_adapter import get_conversation_template
    _fastchat_available = True
except ImportError:
    _fastchat_available = False

TIMEOUT_KEEP_ALIVE = 5 # seconds
logger = init_logger(__name__)
served_model = None
app = fastapi.FastAPI()
engine = None
@app.get("/health") async def health() -> Response: """Health check.""" return Response(status_code=200)

def manual_tokenize(text):
    # Split on digit runs, Chinese/Western punctuation, whitespace, and CJK
    # characters; the '-' is escaped so it is treated literally inside the class.
    tokens = re.split(r'([0-9]+|[,。、?!“”‘’,.\-!?:";()]|\s|[\u4e00-\u9fff])', text)
    tokens = [x for x in tokens if x != '']
    return tokens

def remove_repetition(text, n=2, rep=5):
    words = [*text]

def remove_repetition_line(text, threshold=0.99):
    seen = set()
    output = []

def content_chuli(content):
    # Post-process generated content: collapse blank lines, strip the
    # "assistant" marker, then run the repetition-removal helpers above.
    if "\n\n" in content:
        content = content.replace("\n\n", "\n")
    if "assistant" in content:
        content = content.replace("assistant", "")
    content = remove_repetition_line(content, 0.95)
    content = remove_repetition(content, 4, 8)
    return content.encode("UTF-8").decode("UTF-8")

def create_error_response(status_code: HTTPStatus,
                          message: str) -> JSONResponse:
    return JSONResponse(ErrorResponse(message=message,
                                      type="invalid_request_error").dict(),
                        status_code=status_code.value)

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))

async def check_model(request) -> Optional[JSONResponse]:
    global MODEL_PATH
    if request.model == served_model:
        ...

async def get_gen_prompt(request) -> str:
    # If the prompt is a paper-summarization request, strip any existing copies
    # of the Chinese instruction template (cue1/cue2), normalize whitespace, and
    # re-wrap the query with the template; otherwise just normalize whitespace.
    system_prompt = request.system_prompt
    query = request.prompt
    if "在生成的回答中你需要覆盖以下8个问题:\n1. 研究的是什么" in query:
        cue1 = '请根据提供的论文,完成论文的摘要,摘要字数需要控制在500字。在生成的回答中你需要覆盖以下8个问题:\n1. 研究的是什么\n2. 为什么会发起这项研究\n3. 使用了什么研究方法\n4. 研究的实验结果是什么\n5. 研究的结论是什么\n6. 研究的意义是什么\n7. 研究的关键步骤是什么\n8. 获取文章标题,获取失败则生成标题;\n这段是需要生成技术报告的文本\n'
        cue2 = '\n输出格式要求如下,从8个方面给出回答:\n1. 研究问题:\n2. 研究原因:\n3. 研究方法:\n4. 研究结果:\n5. 研究结论:\n6. 研究意义:\n7. 研究脉络:\n8. 文章标题:'
        query = query.replace(cue1, '')
        query = query.replace(cue2, '')
        query = ' '.join(query.split())
        query = cue1 + query + cue2
    else:
        query = ' '.join(request.prompt.split())

async def check_length(
    request: Union[ChatCompletionRequest, CompletionRequest],
    prompt: Optional[str] = None,
    prompt_ids: Optional[List[int]] = None
) -> Tuple[List[int], Optional[JSONResponse]]:
    assert (not (prompt is None and prompt_ids is None)
            and not (prompt is not None and prompt_ids is not None)
            ), "Either prompt or prompt_ids should be provided."
    if prompt_ids is not None:
        input_ids = prompt_ids
    else:
        input_ids = tokenizer(prompt).input_ids
    token_num = len(input_ids)
@app.get("/v1/models") async def show_available_models(): """Show available models. Right now we only have one model.""" model_cards = [ ModelCard(id=served_model, root=served_model, permission=[ModelPermission()]) ] return ModelList(data=model_cards)

def create_logprobs(token_ids: List[int],
                    id_logprobs: List[Dict[int, float]],
                    initial_text_offset: int = 0) -> LogProbs:
    """Create OpenAI-style logprobs."""
    logprobs = LogProbs()
    last_token_len = 0
    for token_id, id_logprob in zip(token_ids, id_logprobs):
        token = tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
        logprobs.token_logprobs.append(id_logprob[token_id])
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
            logprobs.text_offset.append(logprobs.text_offset[-1] +
                                        last_token_len)
        last_token_len = len(token)
@app.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request):

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="vLLM OpenAI-Compatible RESTful API server.")
    parser.add_argument("--host", type=str, default=None, help="host name")
    parser.add_argument("--port", type=int, default=8098, help="port number")
    parser.add_argument("--allow-credentials",
                        action="store_true",
                        help="allow credentials")
    parser.add_argument("--allowed-origins",
                        type=json.loads,
                        default=["*"],
                        help="allowed origins")
    parser.add_argument("--allowed-methods",
                        type=json.loads,
                        default=["*"],
                        help="allowed methods")
    parser.add_argument("--allowed-headers",
                        type=json.loads,
                        default=["*"],
                        help="allowed headers")
    parser.add_argument("--served-model-name",
                        type=str,
                        default=None,
                        help="The model name used in the API. If not "
                        "specified, the model name will be the same as "
                        "the huggingface name.")
    parser.add_argument("--model_type",
                        type=str,
                        default=None,
                        help="The model name in huggingface.")
    parser.add_argument("--tensor_parallel_size",
                        type=int,
                        default=1,
                        help="number of gpus to use")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.90)
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
```
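The posted script stops at `parse_args()`. For context, here is a minimal sketch of how these arguments typically reach the engine in vLLM 0.2.x's bundled OpenAI-compatible server (this is an assumption about the omitted tail of the script, not the poster's exact code); the `from_engine_args` call is where `tensor_parallel_size=2` kicks off the Ray/NCCL worker setup that fails here:

```python
# Minimal sketch, assuming the vLLM 0.2.x API already imported above:
# turn the parsed CLI args into an async engine and start the server.
engine_args = AsyncEngineArgs.from_cli_args(args)       # picks up tensor-parallel settings
engine = AsyncLLMEngine.from_engine_args(engine_args)   # spawns Ray workers when TP > 1
uvicorn.run(app,
            host=args.host,
            port=args.port,
            log_level="info",
            timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
```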