b'{"created":"2024-01-19T00:12:28.434897","prompts":"Mario jumped","generations":[[{"text":" up and said, \\"I\'m so happy!\\"\\n\\nThe little girl smiled and said, \\"I\'m glad you\'re","score":null,"finished":true,"finished_reason":"length"},{"text":" up and said, \\"I\'m so happy!\\"\\n\\nThe little girl smiled and said, \\"I\'m glad you\'re","score":null,"finished":true,"finished_reason":"length"}]],"input_tokens":null}'
Client Code
# Standard library imports first, third-party second (PEP 8 grouping).
# Note: `argparse` was previously imported twice; the duplicate is removed.
import argparse
import time
from threading import Thread

import requests

# CLI for the stress test: how many concurrent client threads to launch,
# and how many tokens each request should ask the server to generate.
parser = argparse.ArgumentParser(
    description='Stress-test a DeepSparse text-generation endpoint with concurrent streaming clients.'
)
parser.add_argument('--num-threads', type=int, default=1)
parser.add_argument('--num-tokens', type=int, default=25)
def main(num_threads=1, num_tokens=25):
    """Launch `num_threads` concurrent streaming inference requests.

    Each thread POSTs one generation request to the local DeepSparse
    server, streams the response chunks to stdout, and reports its
    wall-clock latency.

    :param num_threads: number of concurrent client threads to start
    :param num_tokens: max_length passed to the server's generation kwargs
    """
    url = "http://localhost:5543/v2/models/text_generation-0/infer"

    def run(idx, prompt="Mario jumped"):
        # One request per thread: stream the response and time the full call.
        print(f"launching thread {idx}")
        start = time.perf_counter()
        obj = {
            "prompt": prompt,
            "generation_kwargs": {
                "max_length": num_tokens
            },
            "streaming": True,
            # Ask the server for two sequences per prompt to exercise
            # multi-sequence streaming.
            "num_return_sequences": 2
        }
        # stream=True so iter_lines() yields chunks as the server emits them.
        response = requests.post(url, json=obj, stream=True)
        for chunk in response.iter_lines():
            if chunk:  # iter_lines yields empty keep-alive lines; skip them
                print(chunk)
        end = time.perf_counter()
        print(f"finished thread {idx} : {(end - start): 0.5f}")

    ts = [Thread(target=run, args=[idx, "Mario jumped"]) for idx in range(num_threads)]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
if __name__ == "__main__":
    # Parse CLI flags and run the concurrent stress test.
    args = parser.parse_args()
    main(num_threads=args.num_threads, num_tokens=args.num_tokens)
# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from openai import OpenAI

# Connect to the locally hosted OpenAI-compatible DeepSparse server.
# The server does not check credentials, so any placeholder key works.
client = OpenAI(base_url="http://localhost:5543/v1", api_key="EMPTY")
models = client.models.list()

model = "hf:mgoin/TinyStories-1M-ds"
print(f"Accessing model API '{model}'")

# Completion API
stream = False
completion = client.chat.completions.create(
    # The chat completions API expects a list of role/content message
    # dicts, not a bare string (previously passed as messages="The dog").
    messages=[{"role": "user", "content": "The dog"}],
    max_tokens=10,
    stream=stream,
    model=model,
    logprobs=True,
)
print(completion)
Output:
ChatCompletion(id='cmpl-57bdfb468783482798577e66c9976f80', choices=[Choice(finish_reason='length', index=None, logprobs=None, message=ChatCompletionMessage(content='\n\n\nOnce upon a time, there was a', role='assistant', function_call=None, tool_calls=None))], created=1705622824, model='hf:mgoin/TinyStories-1M-ds', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=10, prompt_tokens=3, total_tokens=13))
Summary
max_tokens
which was added for a unit test. Updated the mock data for the test instead.
Testing
Deepsparse Server
Output (streaming):
Output (non-streaming):
Client Code
OpenAI
Client Code:
Output:
With Streaming Enabled: