run-llama / llama_index

LlamaIndex is a data framework for your LLM applications
https://docs.llamaindex.ai
MIT License
35.13k stars 4.93k forks

[Question]: I want to build a local multimodal RAG chatbot, but I get the error below #13943

Open LJ-Hao opened 2 months ago

LJ-Hao commented 2 months ago

Question Validation

Question

# Imports assumed from the usage below (they were not included in the original snippet)
import os

import whisper
from moviepy.editor import VideoFileClip

from llama_index.core import Settings, SimpleDirectoryReader, StorageContext
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.schema import ImageNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.multi_modal_llms.ollama import OllamaMultiModal
from llama_index.vector_stores.lancedb import LanceDBVectorStore


# This class will transform video to text and images
class VideoProcessor:
    def __init__(self, video_path, output_audio_path, output_folder):
        self.video_path = video_path
        self.output_audio_path = output_audio_path
        self.image_path = output_folder
        self.text_path = output_folder

    def extract_audio(self):
        video = VideoFileClip(os.path.join(self.video_path, "input.mp4"))
        audio_part = video.audio
        audio_part.write_audiofile(os.path.join(self.output_audio_path, "output_audio.mp3"))

    def extract_text(self):
        model = whisper.load_model("base")
        result = model.transcribe(os.path.join(self.output_audio_path, "output_audio.mp3"))
        with open(os.path.join(self.text_path, "audio.md"), "w") as file:
            file.write(result['text'])

    def extract_frames(self):
        clip = VideoFileClip(os.path.join(self.video_path, "input.mp4"))
        clip.write_images_sequence(os.path.join(self.image_path, "frame%04d.png"), fps=0.2)

    def process_video(self):
        self.extract_audio()
        self.extract_text()
        self.extract_frames()

# This class builds the multimodal retriever and runs the question-answer loop
class retriever_ask:
    def __init__(self, output_folder, similarity_top_k, image_similarity_top_k) -> None:
        self.text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
        self.image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")
        self.output_folder = output_folder
        self.similarity_top_k = similarity_top_k
        self.image_similarity_top_k = image_similarity_top_k
        # self.llm = Ollama(model='llava', request_timeout=100)
        self.llm = OllamaMultiModal(model='llava', request_timeout=3600) 
        self.qa_tmpl_str = (
            "Given the provided information, including relevant images and retrieved context from the video, "
            "accurately and precisely answer the query without any additional prior knowledge.\n"
            "Please ensure honesty and responsibility, refraining from any racist or sexist remarks.\n"
            "---------------------\n"
            "Context: {context_str}\n"
            "Metadata for video: {metadata_str}\n"
            "---------------------\n"
            "Query: {query_str}\n"
            "Answer: "
        )
    def retriever_engine(self):
        documents = SimpleDirectoryReader(self.output_folder).load_data()
        storage_context = StorageContext.from_defaults(vector_store=self.text_store, image_store=self.image_store)
        index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)
        retriever_engine = index.as_retriever(similarity_top_k=self.similarity_top_k, image_similarity_top_k=self.image_similarity_top_k)
        return retriever_engine

    def retrieve(self, retriever_engine, query_str):
        retrieval_results = retriever_engine.retrieve(query_str)
        retrieved_image = []
        retrieved_text = []
        for res_node in retrieval_results:
            if isinstance(res_node.node, ImageNode):
                retrieved_image.append(res_node.node.metadata["file_path"])
            else:
                retrieved_text.append(res_node.text)

        return retrieved_image, retrieved_text

    def ask(self):
        retriever_engine = self.retriever_engine()

        while True:
            try:
                query_str = input('\033[94m' +"Prompt: " + '\033[0m')
                image, text = self.retrieve(retriever_engine=retriever_engine, query_str=query_str)
                context_str = "".join(text)
                metadata_str = image
                image_documents = SimpleDirectoryReader(input_dir=self.output_folder, input_files=image).load_data()
                response = self.llm.complete(
                    prompt=self.qa_tmpl_str.format(context_str=context_str, query_str=query_str, metadata_str=metadata_str),
                    image_documents=image_documents,
                )
                print(response)
            except KeyboardInterrupt:
                break

if __name__ == '__main__':

    Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

    video_path = './video/'
    output_audio_path = './audio/'
    output_folder = './output/'
    # process video to images and text
    processor = VideoProcessor(video_path, output_audio_path, output_folder)
    processor.process_video()
    # Ask question
    engine = retriever_ask(output_folder, 5, 5)
    engine.ask()

But I get the following error:

Prompt: hello
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/httpx/_transports/default.py", line 69, in map_httpcore_exceptions
    yield
  File "/usr/local/lib/python3.8/dist-packages/httpx/_transports/default.py", line 233, in handle_request
    resp = self._pool.handle_request(req)
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/connection_pool.py", line 216, in handle_request
    raise exc from None
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/connection_pool.py", line 196, in handle_request
    response = connection.handle_request(
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/connection.py", line 101, in handle_request
    return self._connection.handle_request(request)
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/http11.py", line 143, in handle_request
    raise exc
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/http11.py", line 113, in handle_request
    ) = self._receive_response_headers(**kwargs)
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/http11.py", line 186, in _receive_response_headers
    event = self._receive_event(timeout=timeout)
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_sync/http11.py", line 224, in _receive_event
    data = self._network_stream.read(
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_backends/sync.py", line 126, in read
    return self._sock.recv(max_bytes)
  File "/usr/lib/python3.8/contextlib.py", line 131, in __exit__
    self.gen.throw(type, value, traceback)
  File "/usr/local/lib/python3.8/dist-packages/httpcore/_exceptions.py", line 14, in map_exceptions
    raise to_exc(exc) from exc
httpcore.ReadTimeout: timed out

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "./mulitRAG.py", line 153, in <module>
    engine.ask()
  File "./mulitRAG.py", line 135, in ask
    response = self.llm.complete(prompt=self.qa_tmpl_str.format(context_str=context_str, query_str=query_str, metadata_str=metadata_str),image_documents=image_documents)
  File "/usr/local/lib/python3.8/dist-packages/llama_index/multi_modal_llms/ollama/base.py", line 160, in complete
    response = self._client.generate(
  File "/usr/local/lib/python3.8/dist-packages/ollama/_client.py", line 126, in generate
    return self._request_stream(
  File "/usr/local/lib/python3.8/dist-packages/ollama/_client.py", line 97, in _request_stream
    return self._stream(*args, **kwargs) if stream else self._request(*args, **kwargs).json()
  File "/usr/local/lib/python3.8/dist-packages/ollama/_client.py", line 68, in _request
    response = self._client.request(method, url, **kwargs)
  File "/usr/local/lib/python3.8/dist-packages/httpx/_client.py", line 827, in request
    return self.send(request, auth=auth, follow_redirects=follow_redirects)
  File "/usr/local/lib/python3.8/dist-packages/httpx/_client.py", line 914, in send
    response = self._send_handling_auth(
  File "/usr/local/lib/python3.8/dist-packages/httpx/_client.py", line 942, in _send_handling_auth
    response = self._send_handling_redirects(
  File "/usr/local/lib/python3.8/dist-packages/httpx/_client.py", line 979, in _send_handling_redirects
    response = self._send_single_request(request)
  File "/usr/local/lib/python3.8/dist-packages/httpx/_client.py", line 1015, in _send_single_request
    response = transport.handle_request(request)
  File "/usr/local/lib/python3.8/dist-packages/httpx/_transports/default.py", line 233, in handle_request
    resp = self._pool.handle_request(req)
  File "/usr/lib/python3.8/contextlib.py", line 131, in __exit__
    self.gen.throw(type, value, traceback)
  File "/usr/local/lib/python3.8/dist-packages/httpx/_transports/default.py", line 86, in map_httpcore_exceptions
    raise mapped_exc(message) from exc
httpx.ReadTimeout: timed out

I am sure the model is running inference on my machine, but I never get a response to the question.

dosubot[bot] commented 2 months ago

To resolve the timeout issue when running your local multimodal RAG chatbot code, you can set the request_timeout parameter in the OllamaMultiModal class. This parameter specifies the timeout for making HTTP requests to the Ollama API server.

Here's how you can set the request_timeout parameter:

from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW
from llama_index.multi_modal_llms.ollama import OllamaMultiModal

ollama_model = OllamaMultiModal(
    base_url="http://localhost:11434",
    model="llava",
    temperature=0.75,
    context_window=DEFAULT_CONTEXT_WINDOW,
    request_timeout=3600,  # Set the timeout to 3600 seconds (1 hour) or any other value you prefer
    additional_kwargs={}
)

In your code, you have already set the request_timeout to 3600 seconds, which should be sufficient for most cases. However, if you are still experiencing timeouts, consider the following additional steps:

  1. Check Network Stability: Ensure that your network connection is stable and that there are no interruptions during the request.

  2. Optimize Video Processing: Large video files can take a significant amount of time to process. Ensure that your video processing steps (audio extraction, text extraction, and frame extraction) are optimized for performance.

  3. Increase Timeout Further: If necessary, you can increase the request_timeout value further to accommodate longer processing times.

  4. Debugging and Logging: Enable detailed logging to diagnose where the timeout is occurring. You can enable debug logging in your application as follows:

    import logging
    import sys
    
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
  5. Asynchronous Operations: If possible, consider using asynchronous operations to handle long-running tasks without blocking the main thread (see the sketch after this list).

By following these steps, you should be able to diagnose and resolve the timeout issue in your multimodal RAG chatbot code [1].
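
For item 5, here is a minimal sketch of how the blocking complete() call could be moved off the main thread with a thread-pool executor. The helper name ask_async is illustrative and not part of llama_index; it assumes the OllamaMultiModal instance from your code:

import asyncio
from functools import partial

async def ask_async(llm, prompt, image_documents):
    # Run the blocking Ollama request in a worker thread so the event loop stays responsive.
    # (Works on Python 3.8; on 3.9+, asyncio.to_thread is an alternative.)
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(
        None, partial(llm.complete, prompt=prompt, image_documents=image_documents)
    )

# Usage inside an async function:
#     response = await ask_async(engine.llm, formatted_prompt, image_documents)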

To continue talking to Dosu, mention @dosu.

LJ-Hao commented 2 months ago

Can someone help? Did I use some function the wrong way?

dosubot[bot] commented 2 months ago

To resolve the timeout issue in your local multimodal RAG chatbot code, ensure that the request_timeout parameter is correctly passed to the Client initialization. The provided code already sets the timeout parameter in the Client initialization using the request_timeout value.

Here is the relevant part of the code:

class OllamaMultiModal(MultiModalLLM):
    # ... other fields and methods ...

    def __init__(self, **kwargs: Any) -> None:
        """Init params."""
        super().__init__(**kwargs)
        self._client = Client(host=self.base_url, timeout=self.request_timeout)

If you are still experiencing timeout issues, consider checking the following:

  1. Network Issues: Ensure there are no network issues causing the timeout.
  2. Server Performance: Verify that the server hosting the model can handle the requests within the specified timeout.
  3. Client-Side Configuration: Double-check that the request_timeout is correctly set when initializing the OllamaMultiModal instance.
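
To rule out items 1 and 2 directly, you can send a single generation request to the Ollama HTTP API itself, bypassing llama_index entirely. This is a minimal sketch that assumes the default Ollama host and port and that the llava model is already pulled:

import httpx

# Send one non-streaming generation request straight to the Ollama server.
response = httpx.post(
    "http://localhost:11434/api/generate",
    json={"model": "llava", "prompt": "Say hello.", "stream": False},
    timeout=3600,
)
response.raise_for_status()
print(response.json()["response"])

If this request also exceeds your request_timeout, the bottleneck is model inference on the server rather than the llama_index client configuration.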

Additionally, ensure that the ollama library is correctly installed and imported, as shown in the OllamaMultiModal class:

def __init__(self, **kwargs: Any) -> None:
    """Init params."""
    # make sure that ollama is installed
    try:
        import ollama  # noqa: F401
    except ImportError:
        raise ImportError(
            "Ollama is not installed. Please install it using `pip install ollama`."
        )
    super().__init__(**kwargs)

If all configurations are correct and the issue persists, you might need to investigate further into the server logs or network conditions to identify the root cause of the timeout.

Additionally, ensure that the httpx.Client is being used correctly with the Timeout object, as shown in the Ollama class from the llama-index library:

with httpx.Client(timeout=Timeout(self.request_timeout)) as client:
    response = client.post(
        url=f"{self.base_url}/api/chat",
        json=payload,
    )
    response.raise_for_status()
    raw = response.json()
    message = raw["message"]
    return ChatResponse(
        message=ChatMessage(
            content=message.get("content"),
            role=MessageRole(message.get("role")),
            additional_kwargs=get_additional_kwargs(
                message, ("content", "role")
            ),
        ),
        raw=raw,
        additional_kwargs=get_additional_kwargs(raw, ("message",)),
    )

Ensure that the Timeout object is correctly instantiated and passed to the httpx.Client to handle the request timeout properly [1][2][3].

To continue talking to Dosu, mention @dosu.

LJ-Hao commented 2 months ago

Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5") Hi, I want to use CLIP to embed my images and text. How can I do that? Could you give me an example?

dosubot[bot] commented 2 months ago

To switch to using CLIP to embed your images and text in your local multimodal RAG chatbot, you can use the ClipEmbedding class provided in the llama_index/embeddings/clip/base.py file. Here is a step-by-step guide:

  1. Install the required packages:

    pip install git+https://github.com/openai/CLIP.git
    pip install torch pillow
  2. Initialize the ClipEmbedding class:

    from llama_index.embeddings.clip import ClipEmbedding
    
    clip_embedding = ClipEmbedding(
       embed_batch_size=10,  # You can adjust the batch size as needed
       model_name="ViT-B/32"  # You can choose from the available CLIP models
    )
  3. Generate text embeddings:

    text = "Your text here"
    text_embedding = clip_embedding._get_text_embedding(text)
  4. Generate image embeddings:

    img_file_path = "path/to/your/image.jpg"
    image_embedding = await clip_embedding._aget_image_embedding(img_file_path)

Here is a complete example:

import asyncio
from llama_index.embeddings.clip import ClipEmbedding

# Initialize the ClipEmbedding class
clip_embedding = ClipEmbedding(
    embed_batch_size=10,  # Adjust the batch size as needed
    model_name="ViT-B/32"  # Choose from the available CLIP models
)

# Generate text embedding
text = "Your text here"
text_embedding = clip_embedding._get_text_embedding(text)
print("Text Embedding:", text_embedding)

# Generate image embedding
async def get_image_embedding():
    img_file_path = "path/to/your/image.jpg"
    image_embedding = await clip_embedding._aget_image_embedding(img_file_path)
    print("Image Embedding:", image_embedding)

# Run the async function to get image embedding
asyncio.run(get_image_embedding())

This code will initialize the CLIP model, generate embeddings for the provided text and image, and print the embeddings [1][2].

To continue talking to Dosu, mention @dosu.

LJ-Hao commented 2 months ago

And how do I use CLIP with LanceDBVectorStore? Could you give me an example?

dosubot[bot] commented 2 months ago

To replace the current embedding model with CLIP and use it with LanceDBVectorStore in your local multimodal RAG chatbot, follow these steps:

  1. Install the required packages:

    pip install git+https://github.com/openai/CLIP.git
    pip install torch pillow
  2. Define the ClipEmbedding class: Use the provided ClipEmbedding class to generate embeddings using the CLIP model. Here is the complete class definition:

    import logging
    from typing import Any, List
    from llama_index.core.base.embeddings.base import Embedding  # type alias used in the return annotations below
    from llama_index.core.bridge.pydantic import Field, PrivateAttr
    from llama_index.core.constants import DEFAULT_EMBED_BATCH_SIZE
    from llama_index.core.embeddings.multi_modal_base import MultiModalEmbedding
    from llama_index.core.schema import ImageType
    from PIL import Image
    import os
    
    logger = logging.getLogger(__name__)
    
    AVAILABLE_CLIP_MODELS = (
       "RN50",
       "RN101",
       "RN50x4",
       "RN50x16",
       "RN50x64",
       "ViT-B/32",
       "ViT-B/16",
       "ViT-L/14",
       "ViT-L/14@336px",
    )
    DEFAULT_CLIP_MODEL = "ViT-B/32"
    
    class ClipEmbedding(MultiModalEmbedding):
       """CLIP embedding models for encoding text and image for Multi-Modal purpose.
    
       This class provides an interface to generate embeddings using a model
       deployed in OpenAI CLIP. At the initialization it requires a model name
       of CLIP.
    
       Note:
           Requires `clip` package to be available in the PYTHONPATH. It can be installed with
           `pip install git+https://github.com/openai/CLIP.git`.
       """
    
       embed_batch_size: int = Field(default=DEFAULT_EMBED_BATCH_SIZE, gt=0)
    
       _clip: Any = PrivateAttr()
       _model: Any = PrivateAttr()
       _preprocess: Any = PrivateAttr()
       _device: Any = PrivateAttr()
    
       @classmethod
       def class_name(cls) -> str:
           return "ClipEmbedding"
    
       def __init__(
           self,
           *,
           embed_batch_size: int = DEFAULT_EMBED_BATCH_SIZE,
           model_name: str = DEFAULT_CLIP_MODEL,
           **kwargs: Any,
       ):
           """Initializes the ClipEmbedding class.
    
           During the initialization the `clip` package is imported.
    
           Args:
               embed_batch_size (int, optional): The batch size for embedding generation. Defaults to 10,
                   must be > 0 and <= 100.
               model_name (str): The model name of Clip model.
    
           Raises:
               ImportError: If the `clip` package is not available in the PYTHONPATH.
               ValueError: If the model cannot be fetched from Open AI. or if the embed_batch_size
                   is not in the range (0, 100].
           """
           if embed_batch_size <= 0:
               raise ValueError(f"Embed batch size {embed_batch_size}  must be > 0.")
    
           try:
               import clip
               import torch
           except ImportError:
               raise ImportError(
                   "ClipEmbedding requires `pip install git+https://github.com/openai/CLIP.git` and torch."
               )
    
           super().__init__(
               embed_batch_size=embed_batch_size, model_name=model_name, **kwargs
           )
    
           try:
               self._device = "cuda" if torch.cuda.is_available() else "cpu"
               is_local_path = os.path.exists(self.model_name)
               if not is_local_path and self.model_name not in AVAILABLE_CLIP_MODELS:
                   raise ValueError(
                       f"Model name {self.model_name} is not available in CLIP."
                   )
               self._model, self._preprocess = clip.load(
                   self.model_name, device=self._device
               )
    
           except Exception as e:
               logger.error("Error while loading clip model.")
               raise ValueError("Unable to fetch the requested embeddings model") from e
    
       # TEXT EMBEDDINGS
    
       async def _aget_query_embedding(self, query: str) -> Embedding:
           return self._get_query_embedding(query)
    
       def _get_text_embedding(self, text: str) -> Embedding:
           return self._get_text_embeddings([text])[0]
    
   def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]:
           results = []
           for text in texts:
               try:
                   import clip
               except ImportError:
                   raise ImportError(
                       "ClipEmbedding requires `pip install git+https://github.com/openai/CLIP.git` and torch."
                   )
               text_embedding = self._model.encode_text(
                   clip.tokenize(text).to(self._device)
               )
               results.append(text_embedding.tolist()[0])
    
           return results
    
       def _get_query_embedding(self, query: str) -> Embedding:
           return self._get_text_embedding(query)
    
       # IMAGE EMBEDDINGS
    
       async def _aget_image_embedding(self, img_file_path: ImageType) -> Embedding:
           return self._get_image_embedding(img_file_path)
    
       def _get_image_embedding(self, img_file_path: ImageType) -> Embedding:
           import torch
           with torch.no_grad():
               image = (
                   self._preprocess(Image.open(img_file_path))
                   .unsqueeze(0)
                   .to(self._device)
               )
               return self._model.encode_image(image).tolist()[0]
  3. Integrate with LanceDBVectorStore: Assuming you have a LanceDBVectorStore class, you need to integrate the ClipEmbedding class with it. Here is a simplified example of how you might do this:

    from lance_db_vector_store import LanceDBVectorStore  # Assuming you have this class
    from llama_index.embeddings.clip import ClipEmbedding  # Importing ClipEmbedding from llama_index
    
    # Initialize the ClipEmbedding
    clip_embedding = ClipEmbedding(model_name="ViT-B/32")
    
    # Create an instance of LanceDBVectorStore with the ClipEmbedding
    vector_store = LanceDBVectorStore(embedding_model=clip_embedding)
    
    # Example usage
    text = "A sample text to encode"
    text_embedding = clip_embedding._get_text_embedding(text)
    vector_store.add_embedding(text, text_embedding)
    
    # For image embeddings
    image_path = "path/to/your/image.jpg"
    image_embedding = clip_embedding._get_image_embedding(image_path)
    vector_store.add_embedding(image_path, image_embedding)

This code demonstrates how to initialize the ClipEmbedding class, generate text and image embeddings, and add them to the LanceDBVectorStore. Adjust the integration according to your specific implementation of LanceDBVectorStore [1][2][3].
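
Alternatively, since your original script already uses llama_index's own LanceDBVectorStore together with MultiModalVectorStoreIndex, CLIP can be plugged in at the index level instead of calling the embedding methods by hand. The following is a sketch under the assumption that Settings.embed_model controls the text side and that MultiModalVectorStoreIndex accepts an image_embed_model argument (its image embedder defaults to CLIP):

from llama_index.core import Settings, SimpleDirectoryReader, StorageContext
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.embeddings.clip import ClipEmbedding
from llama_index.vector_stores.lancedb import LanceDBVectorStore

# Use CLIP for the text embeddings as well; note CLIP's text encoder has a short (77-token)
# context limit, so you may prefer to keep a dedicated text model here and use CLIP only for images.
Settings.embed_model = ClipEmbedding(model_name="ViT-B/32")

text_store = LanceDBVectorStore(uri="lancedb", table_name="text_collection")
image_store = LanceDBVectorStore(uri="lancedb", table_name="image_collection")
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)

documents = SimpleDirectoryReader("./output/").load_data()
index = MultiModalVectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    image_embed_model=ClipEmbedding(model_name="ViT-B/32"),  # assumption: forwarded to the index's image embedder
)
retriever = index.as_retriever(similarity_top_k=5, image_similarity_top_k=5)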

To continue talking to Dosu, mention @dosu.