Open 7flash opened 7 months ago
What's the external chat UI you would like to be able to use?
https://github.com/ztjhz/BetterChatGPT
It works with vllm if you choose "Use custom API endpoint"
It also shows a role dropdown for every message in the chat; perhaps there could be another role type, "outline", next to "assistant", "system", and "user".
+1 on this, having the OAI API exposed from VLLM would be very valuable. Even if it was just pulling through the same API that VLLM can expose in the serve example.
+1
curl http://127.0.0.1:8000/chat/completions \
-d '{
"model": "microsoft/phi-2",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is the capital of France?"
}
],
"schema": {"type": "string", "maxLength": 5}
}'
curl http://127.0.0.1:8000/chat/completions \
-d '{
"model": "microsoft/phi-2",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "What is Pi? Give me the first 15 digits: "
}
],
"regex": "(-)?(0|[1-9][0-9]*)(\\.[0-9]+)?([eE][+-][0-9]+)?"
}'
Currently I have implemented this script based on #598
import argparse
import json
from http import HTTPStatus
from fastapi import FastAPI, Request, Response
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel, Field
from typing import Optional, Union, List, Dict
from outlines.models.openai import OpenAI, OpenAIConfig, openai
import outlines.models as models
import os
import uvicorn
import time
from datetime import datetime
import re
# Server binding and default model, all overridable via environment variables.
HOST = os.getenv("HOST") or "0.0.0.0"
PORT = os.getenv("PORT") or "8000"
MODEL_NAME = os.getenv("MODEL_NAME") or "Nous-Hermes-2-Mixtral-8x7B-DPO"
# NOTE(review): nest_asyncio patches the current event loop to allow nested
# asyncio.run calls; the author reports the script errors without it — confirm
# which call actually needs the patch before removing.
import nest_asyncio
nest_asyncio.apply()
class ChatCompletionInput(BaseModel):
    """Request schema mirroring OpenAI's /chat/completions parameters."""
    prompt: Union[str, List[str]]
    model: str = ""
    frequency_penalty: float = 0.0
    # Token-id -> bias. OpenAI accepts fractional bias values in [-100, 100],
    # so the values must be floats (the previous `Dict[int, int]` rejected them).
    logit_bias: Dict[int, float] = {}
    max_tokens: Optional[int] = None
    n: int = 1
    presence_penalty: float = 0.0
    response_format: Optional[Dict[str, str]] = None
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    temperature: float = 1.0
    # Nucleus-sampling probability mass — a float in (0, 1], not an int.
    top_p: float = 1.0
    user: str = ""
    # When true the caller expects an SSE stream of chunks.
    stream: Optional[bool] = None
class Choice(BaseModel):
    """A single streamed choice inside a chat.completion.chunk response."""
    # "stop" on the terminal chunk, None while content is still streaming.
    finish_reason: Optional[str] = None
    index: int = 0
    # Incremental payload; this server fills role/content/function_call/tool_calls.
    delta: Dict[str, Optional[str]]
class ChatResponse(BaseModel):
    """OpenAI-style chat.completion(.chunk) response envelope."""
    # Unique per instance; the previous revision hard-coded one id, so every
    # response (and every chunk of every conversation) shared the same id.
    id: str = Field(default_factory=lambda: f"chatcmpl-{os.urandom(12).hex()}")
    choices: List[Choice]
    created: int          # unix timestamp of chunk creation
    model: str
    object: str           # "chat.completion.chunk" for streamed replies
    system_fingerprint: Optional[str] = None
    usage: Dict[str, int] = {}
app = FastAPI()
def parse_args():
    """Build and parse the command-line options for the API server.

    Returns the populated ``argparse.Namespace`` with host/port and the
    CORS-related settings (origins, methods, headers, credentials).
    """
    parser = argparse.ArgumentParser(
        description="OpenAI-Compatible RESTful API server using OpenAI's GPT model."
    )
    parser.add_argument("--host", type=str, default="127.0.0.1", help="host name")
    parser.add_argument("--port", type=int, default=8000, help="port number")
    parser.add_argument(
        "--allow-credentials", action="store_true", help="allow credentials"
    )
    # The three CORS list options share the same shape: JSON-decoded, default ["*"].
    for flag, help_text in (
        ("--allowed-origins", "allowed origins"),
        ("--allowed-methods", "allowed methods"),
        ("--allowed-headers", "allowed headers"),
    ):
        parser.add_argument(flag, type=json.loads, default=["*"], help=help_text)
    return parser.parse_args()
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
    """Convert FastAPI request-validation failures into OpenAI-style 400 errors."""
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))
def create_error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
    """Wrap *message* in the OpenAI error payload under the given HTTP status."""
    payload = {"message": message, "type": "invalid_request_error"}
    return JSONResponse(payload, status_code=status_code)
@app.get("/health")
async def health() -> Response:
    """Liveness probe: always answer 200 with an empty body."""
    ok = Response(status_code=200)
    return ok
@app.get("/")
async def index() -> Response:
    """Root endpoint, used as a second health check: empty 200 response."""
    ok = Response(status_code=200)
    return ok
@app.post("/v1/chat/completions")
@app.post("/chat/completions")
async def chat_completions(request_body: Request):
    """OpenAI-compatible chat endpoint proxying through an outlines model.

    Reads the usual sampling parameters plus ``messages`` from the JSON body,
    builds a flat role-prefixed prompt, and always answers with an SSE stream
    of ``chat.completion.chunk`` objects (the request's ``stream`` flag is
    ignored — only streaming is implemented). Any failure is reported as a
    JSON 500 error payload.
    """
    try:
        body = await request_body.json()
        presence_penalty = body.get("presence_penalty", 0.0)
        frequency_penalty = body.get("frequency_penalty", 0.0)
        temperature = body.get("temperature", 1.0)
        top_p = body.get("top_p", 1)
        model = body.get("model", MODEL_NAME)
        messages = body.get("messages", [])

        config = OpenAIConfig(
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            top_p=top_p,
        )
        # Endpoint and credentials come from the environment; the previous
        # revision hard-coded the API key in source, which leaks secrets.
        # The fallbacks preserve the original values for backward compatibility.
        complete = models.openai_compatible_api(
            model,
            base_url=os.getenv("OPENAI_BASE_URL", "https://my-litellm-server"),
            api_key=os.getenv("OPENAI_API_KEY", "sk-xx"),
            encoding="gpt-4",
            config=config,
        )

        def sse_chunk(text: str) -> str:
            """Format *text* as one SSE `data:` line holding a completion chunk.

            The sentinel "[DONE]" produces an empty-content chunk whose
            finish_reason is "stop".
            """
            finish_reason = "stop" if text == "[DONE]" else None
            content = "" if text == "[DONE]" else text
            chunk = ChatResponse(
                choices=[
                    Choice(
                        index=0,
                        delta={"function_call": None, "tool_calls": None, "content": content, "role": "assistant"},
                        finish_reason=finish_reason,
                    )
                ],
                created=int(datetime.now().timestamp()),
                model=model,
                object="chat.completion.chunk",
            )
            return f"data: {json.dumps(chunk.dict())}\n\n"

        def extract_content(msg):
            """Return the message text; assistant text wrapped in '''...''' is unwrapped."""
            if 'content' not in msg:
                return ''
            if msg['role'] == 'assistant':
                match = re.search(r"\'\'\'(.*?)\'\'\'", msg['content'], re.DOTALL)
                if match:
                    return match.group(1).strip()
            return msg['content']

        async def response_streamer():
            # Prefix / fenced body / suffix framing, then the [DONE] sentinel.
            yield sse_chunk("My Response Prefix")
            prompt_context = "\n".join(
                f"{msg['role']}: {extract_content(msg)}" for msg in messages
            )
            yield sse_chunk("\n'''\n")
            response = complete(f"""
Continue the conversation as an Assistant:
{prompt_context}
Assistant:
""")
            yield sse_chunk(f"{response}\n")
            yield sse_chunk("\n'''\n")
            yield sse_chunk("My Response Suffix")
            yield sse_chunk("[DONE]")

        return StreamingResponse(response_streamer(), media_type='text/event-stream')
    except Exception as e:
        # Surface any failure as an OpenAI-style 500 JSON error.
        return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, str(e))
if __name__ == "__main__":
    args = parse_args()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )
    # Environment variables take precedence; otherwise honor the CLI flags.
    # The previous revision parsed --host/--port but silently ignored them,
    # always binding to the HOST/PORT module constants.
    host = os.getenv("HOST") or args.host
    port = int(os.getenv("PORT") or args.port)
    uvicorn.run(app, host=host, port=port, log_level="info")
And now it works in any openai-compatible UI
I am not sure what these lines do, but without them the script throws an error:
import nest_asyncio
nest_asyncio.apply()
Do you think similar functionality can have place embedded in outlines to expose any of example scripts with openai api?
I understand that outlines already serves an OpenAI API with vLLM, but what I would actually like is to be able to use all of outlines' features from an external chat UI. Perhaps there could be another message type, "outline", which would translate to generate.cfg(model, message)?