DepthAnything / Depth-Anything-V2

[NeurIPS 2024] Depth Anything V2. A More Capable Foundation Model for Monocular Depth Estimation
https://depth-anything-v2.github.io
Apache License 2.0

Unit of measure for metric depth? #165


avanmalleghem commented 2 months ago

Hello,

I adapted the run_video.py you provide to use the metric depth models. Here is the code I use. I mainly added some logging, a way to control how often depth is recomputed (desired_output_fps), and a text overlay that prints the predicted depth values on the output.

import argparse
import cv2
import glob
import matplotlib
import numpy as np
import os
import torch
import timeit

from depth_anything_v2.dpt import DepthAnythingV2

def create_text(true_depth, image, frame_width, frame_height):
    # Sample the predicted depth on a regular grid and overlay the metric values on the image
    steps = 20
    for y in range(1, steps):
        for x in range(1, steps):

            px_h = int(y * frame_height/steps)
            px_w = int(x * frame_width/steps)

            # Format the depth value (precision can be customized here)
            label_text = f'{true_depth[px_h, px_w]:.1f}'

            # Draw a marker and the depth value at the sampled pixel
            font = cv2.FONT_HERSHEY_SIMPLEX
            font_scale = 0.4
            font_thickness = 1
            text_color = (0, 0, 0)  # Black text
            text_position = (px_w, px_h)  # Text anchored at the sampled pixel
            cv2.circle(image, (px_w, px_h), radius=5, color=(0, 0, 255), thickness=-1)
            cv2.putText(image, label_text, text_position, font, font_scale, text_color, font_thickness, cv2.LINE_AA)

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation')

    parser.add_argument('--video-path', type=str)
    parser.add_argument('--input-size', type=int, default=518)
    parser.add_argument('--outdir', type=str, default='./vis_video_depth')

    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
    parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth')
    parser.add_argument('--max-depth', type=float, default=80)

    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')

    args = parser.parse_args()

    DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    print(DEVICE)

    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
    depth_anything = depth_anything.to(DEVICE).eval()

    if os.path.isfile(args.video_path):
        if args.video_path.endswith('txt'):
            with open(args.video_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [args.video_path]
    else:
        filenames = glob.glob(os.path.join(args.video_path, '**/*'), recursive=True)

    os.makedirs(args.outdir, exist_ok=True)

    margin_width = 50
    cmap = matplotlib.colormaps.get_cmap('Spectral')

    for k, filename in enumerate(filenames):
        start = timeit.default_timer()
        print(f'Progress {k+1}/{len(filenames)}: {filename}')

        raw_video = cv2.VideoCapture(filename)
        frame_width, frame_height = int(raw_video.get(cv2.CAP_PROP_FRAME_WIDTH)), int(raw_video.get(cv2.CAP_PROP_FRAME_HEIGHT))
        frame_rate = int(raw_video.get(cv2.CAP_PROP_FPS))
        print(f"width : {frame_width} - height : {frame_height} - rate : {frame_rate}")

        if args.pred_only: 
            output_width = frame_width
        else: 
            output_width = frame_width * 2 + margin_width

        output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.mp4')
        out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (output_width, frame_height))

        # Run depth inference roughly desired_output_fps times per second of video; frames in between reuse the last prediction
        desired_output_fps = 1.0
        frame_count = 0
        while raw_video.isOpened():
            ret, raw_frame = raw_video.read()
            if not ret:
                break

            if frame_count % int(frame_rate / desired_output_fps) == 0:
                true_depth = depth_anything.infer_image(raw_frame, args.input_size)

            # Normalize only for the color map; the text overlay still uses the raw metric values in true_depth
            depth = (true_depth - true_depth.min()) / (true_depth.max() - true_depth.min()) * 255.0
            depth = depth.astype(np.uint8)

            if args.grayscale:
                depth = np.repeat(depth[..., np.newaxis], 3, axis=-1)
            else:
                depth = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)

            if args.pred_only:
                out.write(depth)
            else:
                split_region = np.ones((frame_height, margin_width, 3), dtype=np.uint8) * 255
                create_text(true_depth, depth, frame_width, frame_height)
                combined_frame = cv2.hconcat([raw_frame, split_region, depth])

                out.write(combined_frame)

            frame_count += 1

        raw_video.release()
        out.release()

        stop = timeit.default_timer()
        print('Time (seconds): ', stop - start)

I ran the following command: python run_video.py --encoder vitl --load-from checkpoints/depth_anything_v2_metric_vkitti_vitl.pth --video-path ../assets/examples_video --outdir ./output-metrics

And here is an example frame in output:

[example output frame: the raw video on the left, the colored depth map with overlaid depth values on the right]

My issue is that I don't understand the output values. I expected meters, but the values look more like decimeters. Is that expected? Did I miss something?
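
For reference, this is the kind of quick sanity check I had in mind to inspect the raw range returned by infer_image on a single frame (the frame path is a placeholder; the model setup mirrors the script above):

import cv2
import numpy as np
import torch

from depth_anything_v2.dpt import DepthAnythingV2

# Same vitl configuration and checkpoint as in the script above
model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024], max_depth=80)
model.load_state_dict(torch.load('checkpoints/depth_anything_v2_metric_vkitti_vitl.pth', map_location='cpu'))
model = model.to('cuda' if torch.cuda.is_available() else 'cpu').eval()

# 'frame.png' is a placeholder: a frame where the real distance to some object is known
frame = cv2.imread('frame.png')
depth = model.infer_image(frame, 518)

# If the unit is meters, these values should match the scene scale
# (e.g. a few meters indoors, tens of meters for a street scene)
print('min:', depth.min(), 'max:', depth.max(), 'median:', float(np.median(depth)))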