nvidia-holoscan / holohub

Central repository for Holoscan Reference Applications
Apache License 2.0

Can application vila_live be run with multiple camera streams? #603

Closed Shehjad-Ishan closed 2 days ago

Shehjad-Ishan commented 4 days ago

I need to run inference on multiple camera feeds. Can this be done with the present pipeline?

Shehjad-Ishan commented 3 days ago

I have modified vila_live.py as follows:


import base64
import io
import os
import time
from argparse import ArgumentParser
from threading import Event, Thread
from datetime import datetime

import cupy as cp
from PIL import Image, ImageDraw, ImageFont
from holoscan.core import Application, Operator, OperatorSpec
from holoscan.operators import (
    FormatConverterOp,
    HolovizOp,
    V4L2VideoCaptureOp,
    VideoStreamReplayerOp,
)
from holoscan.resources import CudaStreamPool, UnboundedAllocator
from vlm import VLM
from webserver import Webserver

class VLMWebAppOp(Operator):
    """VLM WebApp Operator."""

    def __init__(self, fragment, *args, **kwargs):
        self.server = Webserver()
        self.vlm = VLM()
        self.is_busy = Event()
        self.frame_count = 0
        self.start_time = datetime.now()
        self.frame_rate = 12
        super().__init__(fragment, *args, **kwargs)

    def start(self):
        """Start the web server in a background thread."""
        try:
            self.server.start()
            time.sleep(3)
        except Exception as e:
            print(f"Failed to start web server: {e}")

    def setup(self, spec: OperatorSpec):
        spec.input("video_stream")

    def stop(self):
        """Clean up resources."""
        pass

    def annotate_image(self, image_b64, frame_number, timestamp):
        self.is_busy.set()
        prompt = self.server.user_input.replace('"', "")
        full_response = ""
        current_frame = frame_number

        for response in self.vlm.generate_response(prompt, image_b64):
            if frame_number != current_frame:
                if full_response:
                    chat_history = [
                        [
                            f"Prompt: {prompt}",
                            f"[Frame: {current_frame}, Timestamp: {timestamp}] {full_response}",
                        ]
                    ]
                    self.server.send_chat_history(chat_history)
                full_response = ""
                current_frame = frame_number
            full_response = response

        if full_response:
            chat_history = [
                [
                    f"Prompt: {prompt}",
                    f"[Frame: {current_frame}, Timestamp: {timestamp}] {full_response}",
                ]
            ]
            self.server.send_chat_history(chat_history)

        self.is_busy.clear()

    def compute(self, op_input, op_output, context):
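        # Look up the unnamed ("") tensor in the message received on the "video_stream" port.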
        in_message = op_input.receive("video_stream").get("")
        if in_message:
            self.frame_count += 1
            cp_image = cp.from_dlpack(in_message)
            np_image = cp.asnumpy(cp_image)
            image = Image.fromarray(np_image)

            try:
                font = ImageFont.truetype(
                    "/workspace/holohub/NotoSansKR-Regular.ttf", 36
                )
            except IOError:
                font = ImageFont.load_default()

            draw = ImageDraw.Draw(image)
            draw.text((10, 10), f"Frame: {self.frame_count}", fill=(255, 100, 255), font=font)

            buffer = io.BytesIO()
            image.save(buffer, format="JPEG")
            buffer.seek(0)
            image_b64 = base64.b64encode(buffer.getvalue()).decode()

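            # Launch a new VLM inference thread only when the previous request has finished.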
            if not self.is_busy.is_set():
                timestamp = f"{self.frame_count // 3600:02}:{(self.frame_count % 3600) // 60:02}:{self.frame_count % 60:02}"
                thread = Thread(target=self.annotate_image, args=(image_b64, self.frame_count, timestamp))
                thread.start()

            self.server.send_message({"image_b64": image_b64})

class MultiFeedVLMApp(Application):
    """Application for multi-feed VLM."""

    def __init__(self, data, sources, video_devices):
        super().__init__()
        self.name = "Multi-feed VLM app"
        self.sources = sources
        self.video_devices = video_devices
        self.sample_data_path = data if data != "none" else "/workspace/holohub/data/vila_live"

    def compose(self):
        pool = UnboundedAllocator(self, name="pool")
        formatter_cuda_stream_pool = CudaStreamPool(self, name="formatter_pool", dev_id=0, max_size=5)
        holoviz_cuda_stream_pool = CudaStreamPool(self, name="holoviz_pool", dev_id=0, max_size=5)

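        # Build an independent source -> Holoviz -> format converter -> VLM web app chain for each feed.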
        for i, source in enumerate(self.sources):
            source_kwargs = self.kwargs(f"{source}_source_{i}")
            if source == "v4l2":
                device = self.video_devices[i] if i < len(self.video_devices) else f"/dev/video{i}"
                source_kwargs["device"] = device
                source_op = V4L2VideoCaptureOp(self, name=f"v4l2_source_{i}", allocator=pool, **source_kwargs)
                source_output = "signal"
            elif source == "replayer":
                source_op = VideoStreamReplayerOp(self, name=f"replayer_source_{i}", directory=self.sample_data_path, **source_kwargs)
                source_output = "output"

            visualizer = HolovizOp(
                self,
                name=f"holoviz_{i}",
                window_title=f"Feed {i}",
                headless=True,
                allocator=pool,
                cuda_stream_pool=holoviz_cuda_stream_pool,
                tensors=[
                    {"name": f"render_buffer_output_{i}", "type": "color", "opacity": 1.0, "priority": 0}
                ],
            )

            format_converter = FormatConverterOp(
                self,
                name=f"converter_{i}",
                in_dtype="rgba8888",
                out_dtype="rgb888",
                pool=pool,
                cuda_stream_pool=formatter_cuda_stream_pool,
            )

            web_server = VLMWebAppOp(self, name=f"VLMWebAppOp_{i}")

            self.add_flow(source_op, visualizer, {(source_output, "receivers")})
            self.add_flow(visualizer, format_converter, {("render_buffer_output", "source_video")})
            self.add_flow(format_converter, web_server, {("tensor", "video_stream")})

def main():
    parser = ArgumentParser(description="VILA live application with multi-feed support.")
    parser.add_argument("--sources", nargs="+", default=["v4l2"], help="List of input sources (v4l2 or replayer).")
    parser.add_argument("--video_devices", nargs="+", default=["/dev/video2", "/dev/video3"], help="Video devices for v4l2 sources.")
    parser.add_argument("-c", "--config", default="none", help="Set config path to override the default.")
    parser.add_argument("-d", "--data", default="none", help="Set the data path.")
    args = parser.parse_args()

    config_file = args.config if args.config != "none" else os.path.join(os.path.dirname(__file__), "vila_live.yaml")
    app = MultiFeedVLMApp(args.data, args.sources, args.video_devices)
    app.config(config_file)
    app.run()

if __name__ == "__main__":
    main()

and here is the .yaml config file:

v4l2_source_0:
  device: "/dev/video2"

v4l2_source_1:
  device: "/dev/video3"

replayer_source_0:
  #directory: "/path/to/video1"
  frame_rate: 30
  repeat: true
  realtime: true

replayer_source_1:
  #directory: "/path/to/video2"
  frame_rate: 30
  repeat: true
  realtime: true

holoviz_0:
  window_title: "Feed 0"
  tensors:
    - name: "render_buffer_output_0"
      type: color
      opacity: 1.0
      priority: 0

holoviz_1:
  window_title: "Feed 1"
  tensors:
    - name: "render_buffer_output_1"
      type: color
      opacity: 1.0
      priority: 0

I am getting this error:

[error] [gxf_wrapper.cpp:84] Exception occurred for operator: 'holoviz_0' - Failed to retrieve input 'render_buffer_output_0'
2024-11-27 12:06:43.235 ERROR gxf/std/entity_executor.cpp@552: Failed to tick codelet holoviz_0 in entity: holoviz_0 code: GXF_FAILURE
2024-11-27 12:06:43.294 WARN  gxf/std/greedy_scheduler.cpp@243: Error while executing entity 12 named 'holoviz_0': GXF_FAILURE
2024-11-27 12:06:43.295 INFO  gxf/std/greedy_scheduler.cpp@401: Scheduler finished.
[error] [program.cpp:574] wait failed. Deactivating...
[error] [runtime.cpp:1476] Graph wait failed with error: GXF_FAILURE
[warning] [gxf_executor.cpp:1947] GXF call GxfGraphWait(context) in line 1947 of file /workspace/holoscan-sdk/src/core/executors/gxf/gxf_executor.cpp failed with 'GXF_FAILURE' (1)
[info] [gxf_executor.cpp:1957] [Multi-feed VLM app] Graph execution finished.
[error] [gxf_executor.cpp:1965] [Multi-feed VLM app] Graph execution error: GXF_FAILURE
Traceback (most recent call last):
  File "/workspace/holohub/applications/vila_live/vila_live.py", line 181, in <module>
    main()
  File "/workspace/holohub/applications/vila_live/vila_live.py", line 177, in main
    app.run()
RuntimeError: Failed to retrieve input 'render_buffer_output_0'
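
The error suggests that each HolovizOp is being told, via its tensors parameter, to render an input tensor named render_buffer_output_0 / render_buffer_output_1, while the V4L2 source delivers an unnamed buffer on the receivers port, so no input tensor with that name can be found. Note also that render_buffer_output in the add_flow call is an output port of HolovizOp, which is only created when enable_render_buffer_output=True is set. Below is a minimal per-feed sketch (a fragment of the compose loop above, reusing its variable names), assuming Holoviz can fall back to its defaults for the received buffer when no tensors entry is given:

# Sketch: omit the per-feed "tensors" list so Holoviz uses its defaults for the
# received (unnamed) buffer, and enable the render buffer output port.
visualizer = HolovizOp(
    self,
    name=f"holoviz_{i}",
    window_title=f"Feed {i}",
    headless=True,
    enable_render_buffer_output=True,  # creates the "render_buffer_output" port
    allocator=pool,
    cuda_stream_pool=holoviz_cuda_stream_pool,
)

self.add_flow(source_op, visualizer, {(source_output, "receivers")})
self.add_flow(visualizer, format_converter, {("render_buffer_output", "source_video")})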

Shehjad-Ishan commented 2 days ago

Hello @NigelNelson. Could you please take a look at the output above?

Shehjad-Ishan commented 2 days ago

I was able to integrate multiple sources.
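
For reference, with the argument parser shown above, a two-camera run could be launched with something like the following (the device paths are just examples):

python vila_live.py --sources v4l2 v4l2 --video_devices /dev/video2 /dev/video3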