Hello,

I adapted the run_video.py you provide to integrate metric_depth. Here is the code I use. I mainly added some logs, a way to specify an output fps, and an overlay that prints depth values as text on the output.
import argparse
import cv2
import glob
import matplotlib
import numpy as np
import os
import torch
import timeit
from depth_anything_v2.dpt import DepthAnythingV2

def create_text(true_depth, image, frame_width, frame_height):
    steps = 20
    for y in range(1, steps):
        for x in range(1, steps):
            px_h = int(y * frame_height / steps)
            px_w = int(x * frame_width / steps)
            # Format the depth value (you can customize precision here)
            label_text = f'{true_depth[px_h, px_w]:.1f}'
            # Draw a sample point and its depth value on the image
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 0.4
            font_thickness = 1
            text_color = (0, 0, 0)  # Black text
            text_position = (px_w, px_h)  # Text anchored at the sample point
            cv2.circle(image, (px_w, px_h), radius=5, color=(0, 0, 255), thickness=-1)
            cv2.putText(image, label_text, text_position, font, font_scale, text_color, font_thickness, cv2.LINE_AA)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation')
    parser.add_argument('--video-path', type=str)
    parser.add_argument('--input-size', type=int, default=518)
    parser.add_argument('--outdir', type=str, default='./vis_video_depth')
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
    parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth')
    parser.add_argument('--max-depth', type=float, default=80)
    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')
    args = parser.parse_args()

    DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
    print(DEVICE)

    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
    depth_anything = depth_anything.to(DEVICE).eval()
    if os.path.isfile(args.video_path):
        if args.video_path.endswith('txt'):
            with open(args.video_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [args.video_path]
    else:
        filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True)
    os.makedirs(args.outdir, exist_ok=True)

    margin_width = 50
    cmap = matplotlib.colormaps.get_cmap('Spectral')

    for k, filename in enumerate(filenames):
        start = timeit.default_timer()
        print(f'Progress {k+1}/{len(filenames)}: {filename}')

        raw_video = cv2.VideoCapture(filename)
        frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
        print(f"width : {frame_width} - height : {frame_height} - rate : {frame_rate}")

        if args.pred_only:
            output_width = frame_width
        else:
            output_width = frame_width * 2 + margin_width

        output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4')
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height))

        desired_output_fps = 1.0
        frame_count = 0
        while raw_video.isOpened():
            ret, raw_frame = raw_video.read()
            if not ret:
                break

            # Process one frame every (frame_rate / desired_output_fps) input frames;
            # rounding avoids a float modulo that only matches exact multiples
            if frame_count % round(frame_rate / desired_output_fps) == 0:
                true_depth = depth_anything.infer_image(raw_frame, args.input_size)

                depth = (true_depth - true_depth.min()) / (true_depth.max() - true_depth.min()) * 255.0
                depth = depth.astype(np.uint8)

                if args.grayscale:
                    depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
                else:
                    depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)

                if args.pred_only:
                    out.write(depth)
                else:
                    split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255
                    create_text(true_depth, depth, frame_width, frame_height)
                    combined_frame = cv2.hconcat([raw_frame, split_region, depth])
                    out.write(combined_frame)

            frame_count += 1

        raw_video.release()
        out.release()
        stop = timeit.default_timer()
        print('Time (seconds): ', stop - start)

I ran the following command:

python run_video.py --encoder vitl --load-from checkpoints/depth_anything_v2_metric_vkitti_vitl.pth --video-path ../assets/examples_video --outdir ./output-metrics
And here is an example frame from the output:
My issue is: I don't understand the output values... I expected meters, but the values look more like decimeters. Is that expected? Did I miss something?
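
For reference, here is a minimal standalone check I use to look at the raw predictions before any normalization (a sketch based on the code above; 'frame.png' is just a placeholder for one frame extracted from the video):

import cv2
import torch
from depth_anything_v2.dpt import DepthAnythingV2

# Same vitl config and checkpoint as in my command above
model = DepthAnythingV2(encoder='vitl', features=256,
                        out_channels=[256, 512, 1024, 1024], max_depth=80)
model.load_state_dict(torch.load('checkpoints/depth_anything_v2_metric_vkitti_vitl.pth', map_location='cpu'))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

frame = cv2.imread('frame.png')  # placeholder: one frame extracted from the video
true_depth = model.infer_image(frame, 518)  # HxW array of metric depth values

# Raw statistics, before the min-max normalization used for visualization
print(f'min={true_depth.min():.2f}  max={true_depth.max():.2f}  '
      f'center={true_depth[true_depth.shape[0] // 2, true_depth.shape[1] // 2]:.2f}')

Comparing the center value against a distance I can roughly estimate in the scene should tell whether the factor-of-10 impression comes from the model output itself or only from my text overlay.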