DepthAnything / Depth-Anything-V2

[NeurIPS 2024] Depth Anything V2. A More Capable Foundation Model for Monocular Depth Estimation
https://depth-anything-v2.github.io
Apache License 2.0

3d point cloud from a video #140

gigasurgeon commented 3 months ago

Hi, thanks for your great work. I have generated depth and normal maps for all the frames of my video, and I would like to create a 3D reconstruction from them. Your depth_to_normal_map.py only takes a single image as input. I was wondering whether it would make sense to use all of the frames of the video for a more accurate 3D reconstruction. How should I proceed?
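For context, what I had in mind is something along the lines of fusing the per-frame depth maps with Open3D's TSDF integration, assuming I can get metric depth and camera poses from somewhere. This is just a rough sketch, with placeholder intrinsics and my own hypothetical color_images / depth_maps / cam_to_world containers:

# Minimal TSDF-fusion sketch (intrinsics and data containers below are placeholders).
import numpy as np
import open3d as o3d

fx, fy, cx, cy = 535.4, 539.2, 320.1, 247.6      # example TUM-style intrinsics
width, height = 640, 480

intrinsic = o3d.camera.PinholeCameraIntrinsic(width, height, fx, fy, cx, cy)
volume = o3d.pipelines.integration.ScalableTSDFVolume(
    voxel_length=0.01,                           # 1 cm voxels, tune for the scene
    sdf_trunc=0.04,
    color_type=o3d.pipelines.integration.TSDFVolumeColorType.RGB8)

# color_images: HxWx3 uint8 arrays, depth_maps: HxW float32 arrays in meters,
# cam_to_world: 4x4 camera-to-world matrices -- all produced by my own pipeline.
for color, depth, T_cw in zip(color_images, depth_maps, cam_to_world):
    rgbd = o3d.geometry.RGBDImage.create_from_color_and_depth(
        o3d.geometry.Image(color),
        o3d.geometry.Image(depth.astype(np.float32)),
        depth_scale=1.0, depth_trunc=5.0, convert_rgb_to_intensity=False)
    extrinsic = np.linalg.inv(T_cw)              # integrate() expects world-to-camera
    volume.integrate(rgbd, intrinsic, extrinsic)

mesh = volume.extract_triangle_mesh()
mesh.compute_vertex_normals()
o3d.io.write_triangle_mesh("fused_mesh.ply", mesh)

I'm not sure whether fusing per-frame predictions like this would stay consistent across frames, hence the question.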

SlamMate commented 3 months ago

I modified the code and found that the model is not very good at video depth estimation. Below is the result (I even changed fx and fy to the original TUM or Hypersim intrinsics; the outcome is the same):

[Screenshot from 2024-08-19 11-04-14]

The code is:

import argparse
import cv2
import glob
import numpy as np
import open3d as o3d
import os
from PIL import Image
import torch
from scipy.spatial.transform import Rotation as R

from depth_anything_v2.dpt import DepthAnythingV2

def parse_groundtruth(gt_file):
    """Parse a TUM-format groundtruth.txt into timestamped translation + quaternion (x, y, z, w) poses."""
    poses = {}
    with open(gt_file, 'r') as f:
        for line in f:
            if line.startswith('#') or not line.strip():
                continue
            parts = line.strip().split()
            timestamp = float(parts[0])
            tx, ty, tz = map(float, parts[1:4])
            qx, qy, qz, qw = map(float, parts[4:])
            poses[timestamp] = {'translation': np.array([tx, ty, tz]),
                                'quaternion': np.array([qx, qy, qz, qw])}
    return poses

def find_closest_timestamp(timestamp, poses):
    """Find the closest timestamp in the poses dictionary."""
    closest_timestamp = min(poses.keys(), key=lambda t: abs(t - timestamp))
    return closest_timestamp, poses[closest_timestamp]

def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Generate depth maps and point clouds from images.')
    parser.add_argument('--encoder', default='vitl', type=str, choices=['vits', 'vitb', 'vitl', 'vitg'],
                        help='Model encoder to use.')
    parser.add_argument('--load-from', default='', type=str, required=True,
                        help='Path to the pre-trained model weights.')
    parser.add_argument('--max-depth', default=20, type=float,
                        help='Maximum depth value for the depth map.')
    parser.add_argument('--img-path', type=str, required=True,
                        help='Path to the input image or directory containing images.')
    parser.add_argument('--outdir', type=str, default='./vis_pointcloud',
                        help='Directory to save the output point clouds.')
    parser.add_argument('--focal-length-x', default=535.4, type=float,
                        help='Focal length along the x-axis.')
    parser.add_argument('--focal-length-y', default=539.2, type=float,
                        help='Focal length along the y-axis.')
    parser.add_argument('--voxel-size', default=0.5, type=float,
                        help='Voxel size for downsampling the point cloud.')
    parser.add_argument('--gt-pose', type=str, required=True,
                        help='Path to the groundtruth.txt file.')

    args = parser.parse_args()

    # Determine the device to use (CUDA, MPS, or CPU)
    DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    # Model configuration based on the chosen encoder
    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    # Initialize the DepthAnythingV2 model with the specified configuration
    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})
    depth_anything.load_state_dict(torch.load(args.load_from, map_location='cpu'))
    depth_anything = depth_anything.to(DEVICE).eval()

    # Get the list of image files to process
    if os.path.isfile(args.img_path):
        if args.img_path.endswith('txt'):
            with open(args.img_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [args.img_path]
    else:
        # Collect all files under the directory, skipping sub-directories themselves
        filenames = sorted(f for f in glob.glob(os.path.join(args.img_path, '**/*'), recursive=True)
                           if os.path.isfile(f))

    # Parse the groundtruth poses
    poses = parse_groundtruth(args.gt_pose)

    # Create the output directory if it doesn't exist
    os.makedirs(args.outdir, exist_ok=True)

    # Initialize a combined point cloud
    combined_pcd = o3d.geometry.PointCloud()

    # Process each image file
    for k, filename in enumerate(filenames):
        print(f'Processing {k+1}/{len(filenames)}: {filename}')

        # Load the image
        color_image = Image.open(filename).convert('RGB')
        width, height = color_image.size

        # Read the image with OpenCV and run metric depth inference
        # (infer_image resizes the input internally before passing it to the network)
        image = cv2.imread(filename)
        resized_image = cv2.resize(image, (1024, 768))
        pred = depth_anything.infer_image(resized_image, height)

        # Resize depth prediction to match the original image size
        resized_pred = Image.fromarray(pred).resize((width, height), Image.NEAREST)

        # Generate mesh grid and calculate point cloud coordinates
        x, y = np.meshgrid(np.arange(width), np.arange(height))
        x = (x - width / 2) / args.focal_length_x
        y = (y - height / 2) / args.focal_length_y
        z = np.array(resized_pred)
        points = np.stack((np.multiply(x, z), np.multiply(y, z), z), axis=-1).reshape(-1, 3)
        colors = np.array(color_image).reshape(-1, 3) / 255.0

        # Retrieve the pose for this frame (TUM filenames are timestamps, e.g. 1305031102.175304.png)
        timestamp = float(os.path.splitext(os.path.basename(filename))[0])
        if timestamp in poses:
            pose = poses[timestamp]
        else:
            closest_timestamp, pose = find_closest_timestamp(timestamp, poses)
            print(f"No exact pose information found for {timestamp}. Using closest timestamp: {closest_timestamp}")

        # Convert quaternion to rotation matrix
        rotation_matrix = R.from_quat(pose['quaternion']).as_matrix()

        # Apply the rotation and translation to the points
        transformed_points = points @ rotation_matrix.T + pose['translation']

        # Create the point cloud
        pcd = o3d.geometry.PointCloud()
        pcd.points = o3d.utility.Vector3dVector(transformed_points)
        pcd.colors = o3d.utility.Vector3dVector(colors)

        # Downsample the point cloud
        downsampled_pcd = pcd.voxel_down_sample(voxel_size=args.voxel_size)

        # Combine the downsampled point cloud with the combined point cloud
        combined_pcd += downsampled_pcd

    # Save the combined point cloud to the output directory
    combined_pcd_file = os.path.join(args.outdir, "combined_point_cloud.ply")
    o3d.io.write_point_cloud(combined_pcd_file, combined_pcd)
    print(f"Combined point cloud saved to {combined_pcd_file}")

if __name__ == '__main__':
    main()
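
For reference, I run it roughly like this on a TUM sequence (the script name, paths and checkpoint file below are just placeholders for my local setup):

python depth_to_pointcloud_video.py \
    --encoder vitl \
    --load-from checkpoints/depth_anything_v2_metric_hypersim_vitl.pth \
    --max-depth 20 \
    --img-path /path/to/tum/rgb \
    --gt-pose /path/to/tum/groundtruth.txt \
    --focal-length-x 535.4 --focal-length-y 539.2 \
    --voxel-size 0.05 \
    --outdir ./vis_pointcloud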
SlamMate commented 3 months ago

Moreover, I tried disparity mode and got the same result. I guess the consistency is not good enough for multi-view fusion.
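
One thing that might help a bit (just a common trick, I haven't verified it here) is to fit a per-frame scale and shift against the previous frame before fusing, so the global drift between frames is reduced. A minimal sketch, assuming prev_depth and cur_depth are HxW maps from consecutive, nearly static frames:

import numpy as np

def align_scale_shift(cur_depth, ref_depth, mask=None):
    # Fit ref ~= s * cur + t in the least-squares sense and return the aligned map.
    # This only corrects global scale/shift drift, not local inconsistencies.
    if mask is None:
        mask = np.isfinite(cur_depth) & np.isfinite(ref_depth) & (ref_depth > 0)
    x = cur_depth[mask].ravel()
    y = ref_depth[mask].ravel()
    A = np.stack([x, np.ones_like(x)], axis=1)
    (s, t), *_ = np.linalg.lstsq(A, y, rcond=None)
    return s * cur_depth + t

# e.g. cur_aligned = align_scale_shift(cur_depth, prev_depth)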

petermg commented 3 months ago

Moreover, I tried disparity mode and got the same result. I guess the consistency is not good enough for multi-view fusion.

Thank you for doing this. Very interesting. It would be nice if it were actually consistent between frames :\