dottxt-ai / outlines

Structured Text Generation
https://dottxt-ai.github.io/outlines/
Apache License 2.0

Expose OpenAI API with CFG #594

Open 7flash opened 7 months ago

7flash commented 7 months ago

I understand that Outlines already serves an OpenAI-compatible API with vLLM, but I would like to be able to use all of Outlines' features from an external chat UI. Perhaps there could be another message type, "outline", which would translate to generate.cfg(model, message)?
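
For context, a minimal sketch of how generate.cfg is typically called directly (the model and grammar are placeholders):

import outlines

model = outlines.models.transformers("microsoft/phi-2")

# A toy EBNF grammar, in the Lark format accepted by generate.cfg.
arithmetic_grammar = """
    ?start: expression
    ?expression: term (("+" | "-") term)*
    ?term: factor (("*" | "/") factor)*
    ?factor: NUMBER | "(" expression ")"
    %import common.NUMBER
"""

generator = outlines.generate.cfg(model, arithmetic_grammar)
result = generator("Write an arithmetic expression: ")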

rlouf commented 7 months ago

What's the external chat UI you would like to be able to use?

7flash commented 7 months ago

> What's the external chat UI you would like to be able to use?

https://github.com/ztjhz/BetterChatGPT

It works with vLLM if you choose "Use custom API endpoint".

It also shows a role dropdown for every message in the chat, so I thought "outline" could be another type next to "assistant", "system", and "user"; a sketch of what that might look like is below.
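
Hypothetically, a request using such a role might look like the following. This is only a sketch of the proposal; neither the "outline" role nor any server support for it exists today:

# Hypothetical request body for the proposed "outline" message type.
request_body = {
    "model": "microsoft/phi-2",
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Give me an arithmetic expression."},
        # The grammar below would be routed to generate.cfg(model, message).
        {"role": "outline", "content": "?start: NUMBER"},
    ],
}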

davidsyoung commented 7 months ago

+1 on this: having the OpenAI API exposed from vLLM would be very valuable, even if it were just passing through the same API that vLLM already exposes in the serve example.

louis030195 commented 7 months ago

+1

curl http://127.0.0.1:8000/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "microsoft/phi-2",
        "messages": [
          {
            "role": "system",
            "content": "You are a helpful assistant."
          },
          {
            "role": "user",
            "content": "What is the capital of France?"
          }
        ],
        "schema": {"type": "string", "maxLength": 5}
        }'

curl http://127.0.0.1:8000/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "microsoft/phi-2",
        "messages": [
          {
            "role": "system",
            "content": "You are a helpful assistant."
          },
          {
            "role": "user",
            "content": "What is Pi? Give me the first 15 digits: "
          }
        ],
        "regex": "(-)?(0|[1-9][0-9]*)(\\.[0-9]+)?([eE][+-][0-9]+)?"
        }'

7flash commented 7 months ago

For now, I have implemented this script based on #598:

import argparse
import json
from http import HTTPStatus
from fastapi import FastAPI, Request, Response
from fastapi.exceptions import RequestValidationError
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse, StreamingResponse
from pydantic import BaseModel
from typing import Optional, Union, List, Dict
from outlines.models.openai import OpenAIConfig
import outlines.models as models
import os
import uvicorn
from datetime import datetime
import re

HOST = os.getenv("HOST") or "0.0.0.0"
PORT = os.getenv("PORT") or "8000"
MODEL_NAME = os.getenv("MODEL_NAME") or "Nous-Hermes-2-Mixtral-8x7B-DPO"

import nest_asyncio
nest_asyncio.apply()

class ChatCompletionInput(BaseModel):
    prompt: Union[str, List[str]]
    model: str = ""
    frequency_penalty: float = 0.0
    logit_bias: Dict[int, int] = {}
    max_tokens: Optional[int] = None
    n: int = 1
    presence_penalty: float = 0.0
    response_format: Optional[Dict[str, str]] = None
    seed: Optional[int] = None
    stop: Optional[Union[str, List[str]]] = None
    temperature: float = 1.0
    top_p: float = 1.0
    user: str = ""
    stream: Optional[bool] = None

class Choice(BaseModel):
    finish_reason: Optional[str] = None
    index: int = 0
    delta: Dict[str, Optional[str]]

class ChatResponse(BaseModel):
    id: str = "chatcmpl-8oDFQAzydU0sfeVVo5eOMoLRhKbNz"
    choices: List[Choice]
    created: int
    model: str
    object: str
    system_fingerprint: Optional[str] = None
    usage: Dict[str, int] = {}

app = FastAPI()

def parse_args():
    parser = argparse.ArgumentParser(
        description="OpenAI-compatible RESTful API server backed by Outlines."
    )
    parser.add_argument("--host", type=str, default="127.0.0.1", help="host name")
    parser.add_argument("--port", type=int, default=8000, help="port number")
    parser.add_argument(
        "--allow-credentials", action="store_true", help="allow credentials"
    )
    parser.add_argument(
        "--allowed-origins", type=json.loads, default=["*"], help="allowed origins"
    )
    parser.add_argument(
        "--allowed-methods", type=json.loads, default=["*"], help="allowed methods"
    )
    parser.add_argument(
        "--allowed-headers", type=json.loads, default=["*"], help="allowed headers"
    )
    return parser.parse_args()

@app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))

def create_error_response(status_code: HTTPStatus, message: str) -> JSONResponse:
    return JSONResponse(
        {"message": message, "type": "invalid_request_error"},
        status_code=status_code,
    )

@app.get("/health")
async def health() -> Response:
    """Health check."""
    return Response(status_code=200)

@app.get("/")
async def index() -> Response:
    """Health check."""
    return Response(status_code=200)

@app.post("/v1/chat/completions")
@app.post("/chat/completions")
async def chat_completions(request_body: Request):
    try:
        body = await request_body.json()

        presence_penalty = body.get("presence_penalty", 0.0)
        frequency_penalty = body.get("frequency_penalty", 0.0)
        temperature = body.get("temperature", 1.0)
        top_p = body.get("top_p", 1)
        model = body.get("model", MODEL_NAME)
        messages = body.get("messages", [])
        stream = True  # note: currently unused; the response below always streams

        config = OpenAIConfig(
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            top_p=top_p
        )
        complete = models.openai_compatible_api(
            model,
            base_url="https://my-litellm-server",  # placeholder proxy URL
            api_key="sk-xx",  # placeholder key
            encoding="gpt-4",
            config=config,
        )

        def resp(text: str):
            """Wrap a text fragment as an OpenAI-style SSE chunk."""
            finish_reason = "stop" if text == "[DONE]" else None
            content = "" if text == "[DONE]" else text
            chunk = ChatResponse(
                choices=[
                    Choice(
                        index=0,
                        delta={
                            "function_call": None,
                            "tool_calls": None,
                            "content": content,
                            "role": "assistant",
                        },
                        finish_reason=finish_reason,
                    )
                ],
                created=int(datetime.now().timestamp()),
                model=model,
                object="chat.completion.chunk",
            )
            return f"data: {json.dumps(chunk.dict())}\n\n"

        def extract_content(msg):
            """Return a message's content; for assistant messages, unwrap any '''-delimited block."""
            if 'content' not in msg:
                return ''
            if msg['role'] == 'assistant':
                match = re.search(r"'''(.*?)'''", msg['content'], re.DOTALL)
                if match:
                    return match.group(1).strip()
            return msg['content']

        async def response_streamer():
            yield resp("My Response Prefix")

            prompt_context = "\n".join(
                f"{msg['role']}: {extract_content(msg)}" for msg in messages
            )

            yield resp("\n'''\n")
            response = complete(f"""
                Continue the conversation as an Assistant:

                {prompt_context}
                Assistant:
            """)
            yield resp(f"{response}\n")
            yield resp("\n'''\n")

            yield resp("My Response Suffix")

            yield resp("[DONE]")

        return StreamingResponse(response_streamer(), media_type='text/event-stream')
    except Exception as e:
        return create_error_response(HTTPStatus.INTERNAL_SERVER_ERROR, str(e))

if __name__ == "__main__":
    args = parse_args()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=args.allowed_origins,
        allow_credentials=args.allow_credentials,
        allow_methods=args.allowed_methods,
        allow_headers=args.allowed_headers,
    )

    uvicorn.run(app, host=HOST, port=int(PORT), log_level="info")

And now it works in any OpenAI-compatible UI.
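
A quick way to smoke-test it without a chat UI (this assumes the server above is running locally on port 8000; the prompt is arbitrary):

import json
import requests

resp = requests.post(
    "http://127.0.0.1:8000/v1/chat/completions",
    json={
        "model": "Nous-Hermes-2-Mixtral-8x7B-DPO",
        "messages": [{"role": "user", "content": "Say hello."}],
    },
    stream=True,
)
for line in resp.iter_lines():
    if line.startswith(b"data: "):
        chunk = json.loads(line[len(b"data: "):])
        print(chunk["choices"][0]["delta"]["content"] or "", end="")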

I am not sure what these lines do, but without them the script throws an error:

import nest_asyncio
nest_asyncio.apply()
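
(For reference: nest_asyncio.apply() patches asyncio so that a running event loop can be re-entered; presumably something in this stack calls asyncio.run() from inside FastAPI's already-running loop, which would otherwise fail. A minimal illustration of the error it works around:)

import asyncio

async def main():
    # Without nest_asyncio.apply(), this raises:
    # RuntimeError: asyncio.run() cannot be called from a running event loop
    asyncio.run(asyncio.sleep(0))

asyncio.run(main())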

Do you think similar functionality could be embedded in Outlines, to expose any of the example scripts through an OpenAI-compatible API?