Closed Shehjad-Ishan closed 2 days ago
I have modified the vila_live.py
import base64
import io
import os
import time
from argparse import ArgumentParser
from threading import Event, Thread
from datetime import datetime
import cupy as cp
from PIL import Image, ImageDraw, ImageFont
from holoscan.core import Application, Operator, OperatorSpec
from holoscan.operators import (
FormatConverterOp,
HolovizOp,
V4L2VideoCaptureOp,
VideoStreamReplayerOp,
)
from holoscan.resources import CudaStreamPool, UnboundedAllocator
from vlm import VLM
from webserver import Webserver
class VLMWebAppOp(Operator):
"""VLM WebApp Operator."""
def __init__(self, fragment, *args, **kwargs):
self.server = Webserver()
self.vlm = VLM()
self.is_busy = Event()
self.frame_count = 0
self.start_time = datetime.now()
self.frame_rate = 12
super().__init__(fragment, *args, **kwargs)
def start(self):
"""Start the web server in a background thread."""
try:
self.server.start()
time.sleep(3)
except Exception as e:
print(f"Failed to start web server: {e}")
def setup(self, spec: OperatorSpec):
spec.input("video_stream")
def stop(self):
"""Clean up resources."""
pass
def annotate_image(self, image_b64, frame_number, timestamp):
self.is_busy.set()
prompt = self.server.user_input.replace('"', "")
full_response = ""
current_frame = frame_number
for response in self.vlm.generate_response(prompt, image_b64):
if frame_number != current_frame:
if full_response:
chat_history = [
[
f"Prompt: {prompt}",
f"[Frame: {current_frame}, Timestamp: {timestamp}] {full_response}",
]
]
self.server.send_chat_history(chat_history)
full_response = ""
current_frame = frame_number
full_response = response
if full_response:
chat_history = [
[
f"Prompt: {prompt}",
f"[Frame: {current_frame}, Timestamp: {timestamp}] {full_response}",
]
]
self.server.send_chat_history(chat_history)
self.is_busy.clear()
def compute(self, op_input, op_output, context):
in_message = op_input.receive("video_stream").get("")
if in_message:
self.frame_count += 1
cp_image = cp.from_dlpack(in_message)
np_image = cp.asnumpy(cp_image)
image = Image.fromarray(np_image)
try:
font = ImageFont.truetype(
"/workspace/holohub/NotoSansKR-Regular.ttf", 36
)
except IOError:
font = ImageFont.load_default()
draw = ImageDraw.Draw(image)
draw.text((10, 10), f"Frame: {self.frame_count}", fill=(255, 100, 255), font=font)
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
buffer.seek(0)
image_b64 = base64.b64encode(buffer.getvalue()).decode()
if not self.is_busy.is_set():
timestamp = f"{self.frame_count // 3600:02}:{(self.frame_count % 3600) // 60:02}:{self.frame_count % 60:02}"
thread = Thread(target=self.annotate_image, args=(image_b64, self.frame_count, timestamp))
thread.start()
self.server.send_message({"image_b64": image_b64})
class MultiFeedVLMApp(Application):
"""Application for multi-feed VLM."""
def __init__(self, data, sources, video_devices):
super().__init__()
self.name = "Multi-feed VLM app"
self.sources = sources
self.video_devices = video_devices
self.sample_data_path = data if data != "none" else "/workspace/holohub/data/vila_live"
def compose(self):
pool = UnboundedAllocator(self, name="pool")
formatter_cuda_stream_pool = CudaStreamPool(self, name="formatter_pool", dev_id=0, max_size=5)
holoviz_cuda_stream_pool = CudaStreamPool(self, name="holoviz_pool", dev_id=0, max_size=5)
for i, source in enumerate(self.sources):
source_kwargs = self.kwargs(f"{source}_source_{i}")
if source == "v4l2":
device = self.video_devices[i] if i < len(self.video_devices) else f"/dev/video{i}"
source_kwargs["device"] = device
source_op = V4L2VideoCaptureOp(self, name=f"v4l2_source_{i}", allocator=pool, **source_kwargs)
source_output = "signal"
elif source == "replayer":
source_op = VideoStreamReplayerOp(self, name=f"replayer_source_{i}", directory=self.sample_data_path, **source_kwargs)
source_output = "output"
visualizer = HolovizOp(
self,
name=f"holoviz_{i}",
window_title=f"Feed {i}",
headless=True,
allocator=pool,
cuda_stream_pool=holoviz_cuda_stream_pool,
tensors=[
{"name": f"render_buffer_output_{i}", "type": "color", "opacity": 1.0, "priority": 0}
],
)
format_converter = FormatConverterOp(
self,
name=f"converter_{i}",
in_dtype="rgba8888",
out_dtype="rgb888",
pool=pool,
cuda_stream_pool=formatter_cuda_stream_pool,
)
web_server = VLMWebAppOp(self, name=f"VLMWebAppOp_{i}")
self.add_flow(source_op, visualizer, {(source_output, "receivers")})
self.add_flow(visualizer, format_converter, {("render_buffer_output", "source_video")})
self.add_flow(format_converter, web_server, {("tensor", "video_stream")})
def main():
parser = ArgumentParser(description="VILA live application with multi-feed support.")
parser.add_argument("--sources", nargs="+", default=["v4l2"], help="List of input sources (v4l2 or replayer).")
parser.add_argument("--video_devices", nargs="+", default=["/dev/video2", "/dev/video3"], help="Video devices for v4l2 sources.")
parser.add_argument("-c", "--config", default="none", help="Set config path to override the default.")
parser.add_argument("-d", "--data", default="none", help="Set the data path.")
args = parser.parse_args()
config_file = args.config if args.config != "none" else os.path.join(os.path.dirname(__file__), "vila_live.yaml")
app = MultiFeedVLMApp(args.data, args.sources, args.video_devices)
app.config(config_file)
app.run()
if __name__ == "__main__":
main()
and the .yaml config file
v4l2_source_0:
device: "/dev/video2"
v4l2_source_1:
device: "/dev/video3"
replayer_source_0:
#directory: "/path/to/video1"
frame_rate: 30
repeat: true
realtime: true
replayer_source_1:
#directory: "/path/to/video2"
frame_rate: 30
repeat: true
realtime: true
holoviz_0:
window_title: "Feed 0"
tensors:
- name: "render_buffer_output_0"
type: color
opacity: 1.0
priority: 0
holoviz_1:
window_title: "Feed 1"
tensors:
- name: "render_buffer_output_1"
type: color
opacity: 1.0
priority: 0
Getting this error:
[error] [gxf_wrapper.cpp:84] Exception occurred for operator: 'holoviz_0' - Failed to retrieve input 'render_buffer_output_0'
2024-11-27 12:06:43.235 ERROR gxf/std/entity_executor.cpp@552: Failed to tick codelet holoviz_0 in entity: holoviz_0 code: GXF_FAILURE
2024-11-27 12:06:43.294 WARN gxf/std/greedy_scheduler.cpp@243: Error while executing entity 12 named 'holoviz_0': GXF_FAILURE
2024-11-27 12:06:43.295 INFO gxf/std/greedy_scheduler.cpp@401: Scheduler finished.
[error] [program.cpp:574] wait failed. Deactivating...
[error] [runtime.cpp:1476] Graph wait failed with error: GXF_FAILURE
[warning] [gxf_executor.cpp:1947] GXF call GxfGraphWait(context) in line 1947 of file /workspace/holoscan-sdk/src/core/executors/gxf/gxf_executor.cpp failed with 'GXF_FAILURE' (1)
[info] [gxf_executor.cpp:1957] [Multi-feed VLM app] Graph execution finished.
[error] [gxf_executor.cpp:1965] [Multi-feed VLM app] Graph execution error: GXF_FAILURE
Traceback (most recent call last):
File "/workspace/holohub/applications/vila_live/vila_live.py", line 181, in <module>
main()
File "/workspace/holohub/applications/vila_live/vila_live.py", line 177, in main
app.run()
RuntimeError: Failed to retrieve input 'render_buffer_output_0'
Hello @NigelNelson. Can you please check out the output?
I was able to integrate multiple sources.
I need to run inferences on multiple camera feeds. With the present pipeline can it be done?