Closed · MShahnoor closed this 1 year ago

Hi, currently Grounding DINO works on images only. Using it for videos would be very helpful. I'm wondering if there is a workaround?
I think for video data you can run Grounding DINO inference on each frame and collect the results.
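A minimal sketch of that idea, assuming a loaded groundingdino_model, the repo's predict helper, and a frame-preprocessing step like the preprocess_image posted later in this thread (the video path is hypothetical):

import cv2
from groundingdino.util.inference import predict

results = []  # one (boxes, logits, phrases) tuple per frame

cap = cv2.VideoCapture("input.mp4")  # hypothetical path
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    image_tensor = preprocess_image(frame)  # resize/normalize the frame like load_image does
    boxes, logits, phrases = predict(
        model=groundingdino_model,
        image=image_tensor,
        caption="umpire",
        box_threshold=0.3,
        text_threshold=0.3,
    )
    results.append((boxes, logits, phrases))
cap.release()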
Yep it works for videos! Just implement it like you would for YOLO
For YOLOv8 you only have to provide a video path instead of an image path. I haven't used older versions of YOLO, so I'm not sure how they work. Grounding DINO, on the other hand, expects a single image.
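For reference, here is roughly what the YOLOv8 version looks like (a sketch assuming the ultralytics package is installed and a local test.mp4 exists):

from ultralytics import YOLO

model = YOLO("yolov8n.pt")  # any YOLOv8 checkpoint
# stream=True yields results one frame at a time instead of buffering the whole video
for result in model("test.mp4", stream=True):
    print(result.boxes)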
I'm trying to run inference on each frame of a video, but it isn't giving any output. I don't understand why it doesn't work, because the same code works fine with a single image. Here's the code I'm using for video inference:
import cv2
import torch
from google.colab.patches import cv2_imshow
from groundingdino.util.inference import predict  # groundingdino_model is loaded earlier in the notebook

TEXT_PROMPT = "umpire"
BOX_THRESHOLD = 0.3
TEXT_THRESHOLD = 0.3

video_path = "/content/drive/MyDrive/CV/cricket_segmentation/cricket_segmentation/vid2/test8.mp4"
cap = cv2.VideoCapture(video_path)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    # raw BGR frame turned straight into a CHW float tensor
    frame_tensor = torch.from_numpy(frame).permute(2, 0, 1).float()
    boxes, logits, phrases = predict(
        model=groundingdino_model,
        image=frame_tensor,
        caption=TEXT_PROMPT,
        box_threshold=BOX_THRESHOLD,
        text_threshold=TEXT_THRESHOLD
    )
    print(boxes)
    print(logits)
    print(phrases)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()

Following is the output I get:
tensor([], size=(0, 4))
tensor([])
[]

(the same empty result is printed for every frame)
And here's the image code, which works fine:
TEXT_PROMPT = "umpire"
BOX_THRESHOLD = 0.3
TEXT_THRESHOLD = 0.3

image_source, image = load_image(local_image_path)

boxes, logits, phrases = predict(
    model=groundingdino_model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD
)

print(boxes)
print(logits)
print(phrases)
Output for image inference:
tensor([[0.5042, 0.8128, 0.0676, 0.3667]]) tensor([0.7949]) ['umpire']
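The empty tensors above are consistent with predict receiving a raw frame tensor: load_image resizes the image, converts it to RGB, and normalizes it before inference, and the video loop skips all of that. A quick way to confirm (hypothetical filename, reusing the working image pipeline on a single saved frame):

cv2.imwrite("frame0.jpg", frame)  # dump one video frame to disk
image_source, image = load_image("frame0.jpg")
boxes, logits, phrases = predict(
    model=groundingdino_model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD
)
print(boxes, logits, phrases)  # detections here mean the missing preprocessing is the culprit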
Solved this issue by preprocessing each frame the same way load_image does, running inference on it, and then stitching the annotated frames back into a video. Here's the code:
import cv2
import numpy as np
import torch
from PIL import Image
import groundingdino.datasets.transforms as T
from groundingdino.util.inference import predict, annotate  # groundingdino_model is loaded earlier

def preprocess_image(image_bgr: np.ndarray) -> torch.Tensor:
    # Same transform pipeline that load_image applies: resize, tensorize, normalize
    transform = T.Compose(
        [
            T.RandomResize([800], max_size=1333),
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    # OpenCV frames are BGR; the model expects RGB
    image_pillow = Image.fromarray(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
    image_transformed, _ = transform(image_pillow, None)
    return image_transformed
TEXT_PROMPT = "cricket track, ball, players, umpire"
BOX_THRESHOLD = 0.2
TEXT_THRESHOLD = 0.2
# Replace with the path to your existing video file
video_path = "/content/drive/MyDrive/CV/cricket_segmentation/cricket_segmentation/vid2/test1.mp4"
# Replace with the desired output video file path
output_video_path = "/content/drive/MyDrive/CV/cricket_segmentation/cricket_segmentation/vid2/annotated_video1.mp4"
cap = cv2.VideoCapture(video_path)
# Get the frame width and height
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Define the codec and create VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # use the source fps; fall back to 30 if it isn't reported
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
# preprocess image
transformed_image = preprocess_image(frame)
# Perform object detection on the current frame
boxes, logits, phrases = predict(
model=groundingdino_model,
image=transformed_image,
caption=TEXT_PROMPT,
box_threshold=BOX_THRESHOLD,
text_threshold=TEXT_THRESHOLD
)
# Annotate the frame. annotate() expects an RGB image and returns BGR; since we pass
# a BGR frame, the result comes back with swapped channels, so flip them before writing.
annotated_frame = annotate(image_source=frame, boxes=boxes, logits=logits, phrases=phrases)
annotated_frame = np.ascontiguousarray(annotated_frame[..., ::-1])  # back to BGR for VideoWriter
# Write the annotated frame to the output video
out.write(annotated_frame)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
cap.release()
out.release()
cv2.destroyAllWindows()
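To preview the result inline in Colab, one option is to embed the file as an HTML5 video (a sketch; if the mp4v codec doesn't play in the browser, re-encode the file with ffmpeg first):

from base64 import b64encode
from IPython.display import HTML

mp4 = open(output_video_path, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f'<video width=640 controls src="{data_url}"></video>')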