import asyncio
from ollama import AsyncClient
async def run_query_async(model, query, instance_id):
    """Stream one chat response from Ollama and echo it to stdout.

    Sends ``query`` as a single user message to ``model`` through a freshly
    created ``AsyncClient`` and prints every non-empty content chunk as it
    arrives, each prefixed with ``instance_id``.

    Any ``Exception`` raised while creating the client, issuing the request
    or consuming the stream is caught and reported on stdout rather than
    propagated to the caller.
    """
    try:
        ollama_client = AsyncClient()
        print(f"\nInstance {instance_id}: Starting query")

        conversation = [{'role': 'user', 'content': query}]
        response_stream = await ollama_client.chat(
            model=model,
            messages=conversation,
            stream=True,
        )

        async for part in response_stream:
            content = part['message']['content']
            if not content:
                # Skip empty chunks so no bare prefixes are printed.
                continue
            print(f"Instance {instance_id}: {content}", end='', flush=True)

        print(f"\nInstance {instance_id}: [DONE]")
    except Exception as e:
        print(f"\nInstance {instance_id}: Error - {str(e)}")
async def add_query(model, query, instance_id):
    """Schedule a query as a background task and return the task.

    The coroutine built by ``run_query_async`` is handed to
    ``asyncio.create_task``; this function announces the new task on stdout
    and returns it without awaiting its completion.
    """
    query_coro = run_query_async(model, query, instance_id)
    scheduled = asyncio.create_task(query_coro)
    print(f"\nAdded new task: Instance {instance_id}")
    return scheduled
async def main():
    """Launch the demo queries as concurrent tasks and wait for all of them.

    Three queries are scheduled immediately; a fourth is scheduled five
    seconds later. The function returns once every scheduled task has
    finished.

    NOTE(review): this script only submits the requests concurrently. How
    many of them are actually generated in parallel is presumably decided
    by the Ollama server and its configuration -- confirm against the
    server's settings.
    """
    model = "phi3"  # Replace with your desired model
    queries = [
        "Write a step-by-step guide on how to bake a chocolate cake from scratch.",
        "Develop a python function that solves the following problem, sudoku game",
        "Create a dialogue between two characters that discusses economic crisis",
        "In a forest, there are brave lions living there. Please continue the story.",
    ]

    # Start with 3 initial queries. The instance id is just a label for the
    # printed output; here it is the query's 1-based position in `queries`.
    tasks = [
        await add_query(model, queries[0], 1),
        await add_query(model, queries[1], 2),
        await add_query(model, queries[3], 4),
    ]

    # Wait for a bit before adding another query
    await asyncio.sleep(5)

    # Add the 1 remaining query
    tasks.append(await add_query(model, queries[2], 3))

    # Wait for all tasks to complete
    await asyncio.gather(*tasks)
# Script entry point: run the demo only when this file is executed directly.
if __name__ == "__main__":
    asyncio.run(main())
I have set the environment variable OLLAMA_NUM_PARALLEL=4, so it should be able to handle all instances at the same time. But for some reason it only processes 2 at a time and waits for one of them to finish before processing the next one.
Could it also be that my system is just not able to process more than 2 instances at once? Or is there something I am missing?
This is my testing script:
I have set the environment variable OLLAMA_NUM_PARALLEL=4, so it should be able to handle all instances at the same time. But for some reason it only processes 2 at a time and waits for one of them to finish before processing the next one.
Could it also be that my system is just not able to process more than 2 instances at once? Or is there something I am missing?