To use a local LLM in a multi-modal RAG setup with Ollama and avoid the "no slots available after 10 retries" error, you can set up a local model with Ollama, specifically using the OllamaMultiModal class from the llama_index.multi_modal_llms.ollama module.
Here's a summary of the steps:
Install Ollama: Visit the Ollama homepage to download and install the client for Mac, Linux, or Windows. This client will help you download, install, and run a range of models locally.
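Before installing the Python packages, it can help to confirm the local Ollama server is actually reachable, since the "no slots available after 10 retries" message is returned by the server itself. A minimal sketch, assuming the official ollama Python client and the default local endpoint (http://localhost:11434):
import ollama
# Point the client at the default local Ollama endpoint (adjust the host if you changed it).
client = ollama.Client(host="http://localhost:11434")
# list() returns the models the local server has pulled; a connection error here
# means the Ollama server is not running or not reachable.
print(client.list())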
Install Required Packages:
!pip install llama-index-llms-ollama
!pip install llama-index-multi-modal-llms-ollama
!pip install llama-index-readers-file
!pip install unstructured
!pip install llama-index-embeddings-huggingface
!pip install llama-index-vector-stores-qdrant
!pip install llama-index-embeddings-clip
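As a quick sanity check after installation, you can try importing the pieces used in the steps below. This is just a sketch and assumes the packages above, including the multi-modal Ollama integration, installed cleanly:
# These imports should succeed if the integrations are installed correctly.
from llama_index.multi_modal_llms.ollama import OllamaMultiModal
from llama_index.embeddings.clip import ClipEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import SimpleDirectoryReader
print("llama-index integrations imported successfully")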
Pick and Run a Model:
Choose a model that balances power and resource usage. For example, you can run the mixtral:8x7b model:
ollama run mixtral:8x7b
Import and Initialize the Model:
from llama_index.multi_modal_llms.ollama import OllamaMultiModal
mm_model = OllamaMultiModal(model="mixtral:8x7b")
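Optionally, run a quick text-only smoke test before wiring the model into the index. This is a sketch; it assumes the model has finished downloading and that passing an empty image_documents list is acceptable for a plain-text prompt:
# Quick check that the local model responds at all (no images attached).
test_response = mm_model.complete(
    prompt="Reply with the single word: ready",
    image_documents=[],
)
print(test_response.text)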
Build Multi-Modal Index:
Use a local CLIP model to embed images and text, and create a MultiModalVectorStoreIndex:
from llama_index.core.indices.multi_modal.base import MultiModalVectorStoreIndex
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.embeddings.clip import ClipEmbedding
import qdrant_client
client = qdrant_client.QdrantClient(path="qdrant_mm_db")
text_store = QdrantVectorStore(client=client, collection_name="text_collection")
image_store = QdrantVectorStore(client=client, collection_name="image_collection")
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
image_embed_model = ClipEmbedding()
documents = SimpleDirectoryReader("./mixed_wiki/").load_data()
index = MultiModalVectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    image_embed_model=image_embed_model,
)
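Before querying with the LLM, you can sanity-check retrieval on its own. The sketch below assumes MultiModalVectorStoreIndex.as_retriever accepts the usual similarity_top_k / image_similarity_top_k arguments:
# Retrieve the top text and image nodes for a test query, without calling the LLM.
retriever = index.as_retriever(similarity_top_k=3, image_similarity_top_k=3)
nodes = retriever.retrieve("Tell me more about the Porsche")
for node in nodes:
    print(type(node.node).__name__, node.score)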
Query the Index:
Use the SimpleMultiModalQueryEngine to query the index:
from llama_index.core.prompts import PromptTemplate
from llama_index.core.query_engine import SimpleMultiModalQueryEngine
qa_tmpl_str = (
    "Context information is below.\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Given the context information and not prior knowledge, "
    "answer the query.\n"
    "Query: {query_str}\n"
    "Answer: "
)
qa_tmpl = PromptTemplate(qa_tmpl_str)
query_engine = index.as_query_engine(llm=mm_model, text_qa_template=qa_tmpl)
query_str = "Tell me more about the Porsche"
response = query_engine.query(query_str)
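You can then print the answer and, if needed, inspect which nodes it was grounded on. A minimal sketch, assuming the response object follows the usual LlamaIndex Response interface with source_nodes:
# Print the generated answer and the retrieved sources behind it.
print(str(response))
for source in response.source_nodes:
    print(source.node.metadata, source.score)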
By following these steps, you can set up a local multi-modal RAG system using Ollama without relying on external APIs, which should help avoid the "no slots available after 10 retries" error [1].
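If the local server is busy (for example, another request is still holding its parallel slots), the same "no slots available" error can still surface. A simple mitigation is to retry the call with a short backoff; the sketch below uses a hypothetical query_with_retry helper and assumes the ollama package exposes ResponseError, as seen in the traceback further down:
import time
from ollama import ResponseError

# Retry the query a few times with a short delay if the Ollama server reports no free slots.
def query_with_retry(engine, query, attempts=3, delay_seconds=5):
    for attempt in range(attempts):
        try:
            return engine.query(query)
        except ResponseError as exc:
            if "no slots available" in str(exc) and attempt < attempts - 1:
                time.sleep(delay_seconds)
            else:
                raise

response = query_with_retry(query_engine, "Tell me more about the Porsche")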
Question
I am trying to integrate a local LLM model into an existing multi-modal RAG setup, but I am getting the following error. How can I use a local LLM model instead of calling external APIs?
{ "name": "ResponseError", "message": "no slots available after 10 retries", "stack": "--------------------------------------------------------------------------- ResponseError Traceback (most recent call last) Cell In[7], line 15 11 plot_images(img) 13 chat_history_str = memory.load_memory_variables({})['history'] ---> 15 response_1 = ollama_mm_llm.complete( 16 prompt=qa_tmpl_str.format( 17 context_str=context_str, 18 query_str=query_str, 19 metadata_str=metadata_str, 20 chat_history_str=chat_history_str 21 ), 22 image_documents=image_documents, 23 ) 25 print(\"Question : \",query_str) 26 print(\"Answer : \",response_1.text)
File ~/anaconda3/envs/rag/lib/python3.11/site-packages/llama_index/core/instrumentation/dispatcher.py:311, in Dispatcher.span..wrapper(func, instance, args, kwargs)
308 _logger.debug(f\"Failed to reset active_span_id: {e}\")
310 try:
--> 311 result = func(*args, **kwargs)
312 if isinstance(result, asyncio.Future):
313 # If the result is a Future, wrap it
314 new_future = asyncio.ensure_future(result)
File ~/anaconda3/envs/rag/lib/python3.11/site-packages/llama_index/core/llms/callbacks.py:431, in llm_completion_callback..wrap..wrapped_llm_predict(_self, *args, *kwargs)
422 event_id = callback_manager.on_event_start(
423 CBEventType.LLM,
424 payload={
(...)
428 },
429 )
430 try:
--> 431 f_return_val = f(_self, args, **kwargs)
432 except BaseException as e:
433 callback_manager.on_event_end(
434 CBEventType.LLM,
435 payload={EventPayload.EXCEPTION: e},
436 event_id=event_id,
437 )
File ~/anaconda3/envs/rag/lib/python3.11/site-packages/llama_index/multi_modal_llms/ollama/base.py:165, in OllamaMultiModal.complete(self, prompt, image_documents, formatted, kwargs) 157 def complete( 158 self, 159 prompt: str, (...) 162 kwargs: Any, 163 ) -> CompletionResponse: 164 \"\"\"Complete.\"\"\" --> 165 response = self._client.generate( 166 model=self.model, 167 prompt=prompt, 168 images=image_documents_to_base64(image_documents), 169 stream=False, 170 options=self._model_kwargs, 171 **kwargs, 172 ) 173 return CompletionResponse( 174 text=response[\"response\"], 175 raw=response, 176 additional_kwargs=get_additional_kwargs(response, (\"response\",)), 177 )
File ~/anaconda3/envs/rag/lib/python3.11/site-packages/ollama/_client.py:163, in Client.generate(self, model, prompt, suffix, system, template, context, stream, raw, format, images, options, keep_alive) 160 if not model: 161 raise RequestError('must provide a model') --> 163 return self._request_stream( 164 'POST', 165 '/api/generate', 166 json={ 167 'model': model, 168 'prompt': prompt, 169 'suffix': suffix, 170 'system': system, 171 'template': template, 172 'context': context or [], 173 'stream': stream, 174 'raw': raw, 175 'images': [_encode_image(image) for image in images or []], 176 'format': format, 177 'options': options or {}, 178 'keep_alive': keep_alive, 179 }, 180 stream=stream, 181 )
File ~/anaconda3/envs/rag/lib/python3.11/site-packages/ollama/_client.py:99, in Client._request_stream(self, stream, *args, kwargs) 93 def _request_stream( 94 self, 95 *args, 96 stream: bool = False, 97 *kwargs, 98 ) -> Union[Mapping[str, Any], Iterator[Mapping[str, Any]]]: ---> 99 return self._stream(args, kwargs) if stream else self._request(*args, **kwargs).json()
File ~/anaconda3/envs/rag/lib/python3.11/site-packages/ollama/_client.py:75, in Client._request(self, method, url, **kwargs) 73 response.raise_for_status() 74 except httpx.HTTPStatusError as e: ---> 75 raise ResponseError(e.response.text, e.response.status_code) from None 77 return response
ResponseError: no slots available after 10 retries" }