Closed: aayushsss1 closed this issue 2 months ago
I'm getting a 404 Not Found error using the code below:
from langchain_openai import OpenAI

llm = OpenAI(
    base_url=f"http://{INGRESS_HOST}:{INGRESS_PORT}/v1/",
    api_key=api_key,
    default_headers={
        "Host": SERVICE_HOSTNAME,
    },
    model="MODEL_NAME",
    temperature=0.8,
    top_p=1,
)

messages = [
    (
        "system",
        "You are a helpful assistant that translates English to French. Translate the user sentence.",
    ),
    ("human", "I love programming."),
]
ai_msg = llm.invoke(messages)
ai_msg
KServe version: 0.13.1
I saw in the README that you are using the Hugging Face runtime. In my case, I've deployed an LLM with the Custom Python Serving Runtime: https://kserve.github.io/website/latest/modelserving/v1beta1/custom/custom_model/
The Open Inference API is working well, but an OpenAI inference endpoint would be better for developers.
Hi @allilou, did you try removing the trailing "/" after the "v1" in your base_url? Can you share your custom runtime implementation, if possible?
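For reference, that would look like the following (same arguments as in your snippet, with INGRESS_HOST, INGRESS_PORT, SERVICE_HOSTNAME, and api_key as defined in your code; only the trailing slash changes):

llm = OpenAI(
    base_url=f"http://{INGRESS_HOST}:{INGRESS_PORT}/v1",  # no trailing "/" after v1
    api_key=api_key,
    default_headers={"Host": SERVICE_HOSTNAME},
    model="MODEL_NAME",
    temperature=0.8,
    top_p=1,
)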
I'm using this code to make the call
from openai import OpenAI

client = OpenAI(
    base_url=f"{SERVER_URL}/openai/v1",
)

chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": "Say this is a test",
        }
    ],
    model="my-model-name",
)
And I'm getting this error
Traceback (most recent call last):
  File "../test_openai_api_on_kserve.py", line 11, in <module>
    chat_completion = client.chat.completions.create(
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.11/site-packages/openai/_utils/_utils.py", line 274, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.11/site-packages/openai/resources/chat/completions.py", line 679, in create
    return self._post(
           ^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.11/site-packages/openai/_base_client.py", line 1260, in post
    return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.11/site-packages/openai/_base_client.py", line 937, in request
    return self._request(
           ^^^^^^^^^^^^^^
  File "/home/vscode/.local/lib/python3.11/site-packages/openai/_base_client.py", line 1041, in _request
    raise self._make_status_error_from_response(err.response) from None
openai.NotFoundError: Error code: 404 - {'detail': 'Not Found'}
My custom model is the following (KServe 0.13):
# Imports reconstructed for completeness; adjust to your KServe / vLLM versions.
import logging
from typing import Any, Dict

import kserve
from colorama import Fore
from kserve import InferOutput, InferRequest, InferResponse, ModelServer
from kserve.errors import ModelMissingError
from kserve.utils.utils import generate_uuid
from vllm import EngineArgs, LLMEngine, SamplingParams

# LLM_ID and LLM_PATH (model id and download directory), as well as the logger
# and _error_response helper used below, are defined elsewhere in the original module.

_logger = logging.getLogger(__name__)


class KserveLLM(kserve.Model):
    llm: LLMEngine

    def __init__(self, name: str):
        super().__init__(name)
        self.name = name
        self.ready = False

    def load(self):
        engine_args = EngineArgs(
            model=LLM_ID, download_dir=LLM_PATH, dtype="half", enforce_eager=True
        )
        self.llm = LLMEngine.from_engine_args(engine_args)

    def predict(self, request: InferRequest, headers: Dict = None) -> Dict:
        """Class method wrapping self.model.predict()."""
        input_query = ""
        max_new_tokens = 1000
        temperature = 0.01
        top_k = 2
        top_p = 0.01
        for input in request.inputs:
            if input.name == "input_text":
                input_query = input.data[0]
            elif input.name == "temperature":
                temperature = input.data[0]
            elif input.name == "top_k":
                top_k = input.data[0]
            elif input.name == "top_p":
                top_p = input.data[0]
            elif input.name == "max_new_tokens":
                max_new_tokens = input.data[0]
        if len(input_query) == 0:
            error_message = "Empty query text!"
            self.logger.warning(f"[LLM]: {error_message}")
            return self._error_response(error_message)
        self.llm_sampling_params = SamplingParams(
            temperature=temperature,
            max_tokens=max_new_tokens,
            top_k=top_k,
            top_p=top_p,
        )
        self.logger.info(f"[LLM] Query: {input_query}")
        self.llm.add_request("0", input_query, self.llm_sampling_params)
        output_text = ""
        try:
            # Step the vLLM engine until the request is finished.
            while True:
                request_outputs = self.llm.step()
                for request_output in request_outputs:
                    if request_output.finished:
                        output_text += " ".join(
                            [o.text for o in request_output.outputs]
                        )
                if not self.llm.has_unfinished_requests():
                    break
        except Exception as e:
            error_message = "ERROR: Failed to generate predictions!"
            self.logger.error(f"[LLM] {error_message} {str(e)}")
            return self._error_response(error_message)
        self.logger.info(f"[LLM-OUT]: {output_text}")
        response_id = generate_uuid()
        infer_output = InferOutput(
            name="predictions", shape=[1, 1], datatype="FP32", data=[output_text]
        )
        infer_response = InferResponse(
            model_name=self.name, infer_outputs=[infer_output], response_id=response_id
        )
        return infer_response


if __name__ == "__main__":
    models_list: list[Any] = []
    # model servers
    llm: KserveLLM = KserveLLM("my-model-name")
    try:
        llm.load()
        models_list.append(llm)
    except ModelMissingError:
        _logger.error("failed to load model [LLM]")
    if len(models_list) == 0:
        print("[NO MODEL TO LOAD]")
        exit()
    print(f"[LOADED]: {[type(model).__name__ for model in models_list]}")
    print(f"{Fore.BLUE}[SERVER]: STARTING")
    # Init ModelServer
    model_server = ModelServer(http_port=8080, workers=1, enable_docs_url=True)
    # Start server
    model_server.start(models_list)
Hi @allilou, your example uses kserve.Model to build a custom model, but you are trying to use the OpenAI SDK to connect to it. The model should be reachable via the open inference protocol, as sketched below. However, I don't think it will work out of the box with the OpenAI SDK (except for the huggingface server). Maybe @yuzisun can comment. I would also suggest raising an issue in the KServe repo.
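For example (an illustrative sketch only; SERVER_URL is a placeholder for your ingress or service URL, and everything other than input_text is optional), a v2 REST call to the custom model above could look like this:

import requests

SERVER_URL = "http://localhost:8080"  # placeholder

# The input names match what the custom predict() reads:
# input_text, temperature, top_k, top_p, max_new_tokens.
payload = {
    "inputs": [
        {"name": "input_text", "shape": [1], "datatype": "BYTES", "data": ["Say this is a test"]},
        {"name": "temperature", "shape": [1], "datatype": "FP32", "data": [0.1]},
        {"name": "max_new_tokens", "shape": [1], "datatype": "INT32", "data": [256]},
    ]
}

resp = requests.post(f"{SERVER_URL}/v2/models/my-model-name/infer", json=payload)
resp.raise_for_status()
print(resp.json()["outputs"][0]["data"][0])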
You should implement the OpenAIModel chat completion API instead.
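Concretely, a server-side skeleton could look roughly like the following. This is an untested sketch: the base class, import path, and method names (OpenAIChatAdapterModel, apply_chat_template, create_completion from kserve.protocol.rest.openai) are assumptions based on how KServe 0.13's huggingface runtime exposes its OpenAI endpoints, so verify them against your installed version; the generation logic itself is left as a stub.

from typing import Iterable

from kserve import ModelServer
from kserve.protocol.rest.openai import (  # verify this path for your KServe version
    ChatCompletionRequestMessage,
    ChatPrompt,
    CompletionRequest,
    OpenAIChatAdapterModel,
)


class OpenAICompatibleLLM(OpenAIChatAdapterModel):
    """Serves a model through KServe's OpenAI endpoints (/openai/v1/...)."""

    def __init__(self, name: str):
        super().__init__(name)
        self.ready = False

    def load(self):
        # Load your engine / weights here (e.g. the vLLM LLMEngine above).
        self.ready = True

    def apply_chat_template(
        self, messages: Iterable[ChatCompletionRequestMessage]
    ) -> ChatPrompt:
        # Naive chat-to-prompt conversion; replace with your model's template.
        prompt = "\n".join(f"{m.role}: {m.content}" for m in messages)
        return ChatPrompt(prompt=prompt)

    async def create_completion(self, request: CompletionRequest):
        # Generate text from request.params (prompt, max_tokens, temperature, ...)
        # and return an OpenAI-style Completion object.
        raise NotImplementedError


if __name__ == "__main__":
    ModelServer(http_port=8080).start([OpenAICompatibleLLM("my-model-name")])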
Describe the change you'd like to see
With KServe now supporting the OpenAI schema for LLM runtimes, it would be helpful to have a few examples of use cases using native LangChain and LlamaIndex features, such as text generation, RAG Q&A, and chat, with KServe-hosted models.
Additional context
Sample call -
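As a hedged illustration only (placeholder host, hostname, and model name; it assumes the huggingface runtime's OpenAI-compatible /openai/v1 endpoint), a LangChain chat call could look like:

from langchain_openai import ChatOpenAI

# Placeholders for the KServe ingress and InferenceService host header.
INGRESS_HOST = "localhost"
INGRESS_PORT = "8080"
SERVICE_HOSTNAME = "my-model.default.example.com"

llm = ChatOpenAI(
    base_url=f"http://{INGRESS_HOST}:{INGRESS_PORT}/openai/v1",
    api_key="not-used",  # placeholder; the endpoint is not the actual OpenAI service
    default_headers={"Host": SERVICE_HOSTNAME},
    model="my-model-name",
    temperature=0.8,
)

messages = [
    ("system", "You are a helpful assistant that translates English to French."),
    ("human", "I love programming."),
]
print(llm.invoke(messages).content)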
Original Issue - https://github.com/kserve/kserve/issues/3419