DepthAnything / Depth-Anything-V2

Depth Anything V2. A More Capable Foundation Model for Monocular Depth Estimation
https://depth-anything-v2.github.io
Apache License 2.0

The predicted depth maps are not consistent #112

Open · YacineDeghaies opened this issue 1 month ago

YacineDeghaies commented 1 month ago

After fine-tuning on my own dataset, I tried to predict depth maps, but the model does not predict the background correctly. As you can see from the ground-truth depth map, the background is gray, yet in the predicted depth map it is black.

Is this an effect of the pre-trained model?

Ground-truth depth map (attached image: gt)

Predicted depth map (attached image: scaled_one_from_Tuesday)
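
Before attributing this to the pre-trained weights, it is worth ruling out the visualization: run.py rescales every prediction to its own min/max before saving, so pixels that are empty (zero) in the ground truth can render as black even when the metric values are sensible. Below is a minimal diagnostic sketch along these lines, where gt.png and pred_raw_depth_meter.npy are placeholders for your own ground-truth image and the raw output saved with --save-numpy:

import cv2
import numpy as np

gt = cv2.imread('gt.png', cv2.IMREAD_UNCHANGED).astype(np.float32)   # placeholder path
pred = np.load('pred_raw_depth_meter.npy').astype(np.float32)        # placeholder path
# If your GT is stored with a scale factor (e.g. millimetres), divide it by that scale first.

# Match resolutions in case the prediction was saved at a different size
if pred.shape != gt.shape:
    pred = cv2.resize(pred, (gt.shape[1], gt.shape[0]))

# Many depth datasets mark missing background as 0; compare valid pixels separately
valid = gt > 0
print('fraction of valid GT pixels:', valid.mean())
print('GT   range (valid):', gt[valid].min(), gt[valid].max())
print('pred range (valid):', pred[valid].min(), pred[valid].max())
if (~valid).any():
    print('pred range (background):', pred[~valid].min(), pred[~valid].max())

# Render both with the same fixed range so the gray levels are directly comparable
lo, hi = gt[valid].min(), gt[valid].max()
cv2.imwrite('gt_vis.png', (np.clip((gt - lo) / (hi - lo), 0, 1) * 255).astype(np.uint8))
cv2.imwrite('pred_vis.png', (np.clip((pred - lo) / (hi - lo), 0, 1) * 255).astype(np.uint8))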

HaosenZ commented 1 month ago

Hello, how do you use the fine-tuned model to predict depth maps? After I finished training, I replaced the model in run.py with latest.pth, but an error occurred. I would be extremely grateful for your reply.

YacineDeghaies commented 1 month ago

> Hello, how do you use the fine-tuned model to predict depth maps? After I finished training, I replaced the model in run.py with latest.pth, but an error occurred. I would be extremely grateful for your reply.

Here is my modified code for Depth-Anything-V2/metric_depth/run.py:

import argparse
import cv2
import glob
import matplotlib
import numpy as np
import os
import torch
from depth_anything_v2.dpt import DepthAnythingV2

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Depth Anything V2 Metric Depth Estimation')

    parser.add_argument('--img-path', type=str)
    parser.add_argument('--input-size', type=int, default=518)
    parser.add_argument('--outdir', type=str, default='./vis_depth')
    parser.add_argument('--encoder', type=str, default='vitl', choices=['vits', 'vitb', 'vitl', 'vitg'])
    parser.add_argument('--load-from', type=str, default='checkpoints/depth_anything_v2_metric_hypersim_vitl.pth')
    parser.add_argument('--max-depth', type=float, default=80)
    parser.add_argument('--save-numpy', dest='save_numpy', action='store_true', help='save the model raw output')
    parser.add_argument('--pred-only', dest='pred_only', action='store_true', help='only display the prediction')
    parser.add_argument('--grayscale', dest='grayscale', action='store_true', help='do not apply colorful palette')

    args = parser.parse_args()

    DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    model_configs = {
        'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
        'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
        'vitg': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]}
    }

    depth_anything = DepthAnythingV2(**{**model_configs[args.encoder], 'max_depth': args.max_depth})

    # Load the fine-tuned checkpoint: it stores the weights under the 'model' key,
    # with keys prefixed by 'module.' (from DistributedDataParallel), so remap them
    # into a plain state_dict before loading
    checkpoint = torch.load(args.load_from, map_location='cpu')
    my_state_dict = {}
    for key in checkpoint['model'].keys():
        new_key = key.replace('module.', '')  # strip the 'module.' prefix if present
        my_state_dict[new_key] = checkpoint['model'][key]
    depth_anything.load_state_dict(my_state_dict, strict=False)  # strict=False tolerates missing/unexpected keys

    depth_anything = depth_anything.to(DEVICE).eval()

    if os.path.isfile(args.img_path):
        if args.img_path.endswith('txt'):
            with open(args.img_path, 'r') as f:
                filenames = f.read().splitlines()
        else:
            filenames = [args.img_path]
    else:
        filenames = glob.glob(os.path.join(args.img_path, '**/*'), recursive=True)

    os.makedirs(args.outdir, exist_ok=True)

    cmap = matplotlib.colormaps.get_cmap('Spectral')

    for k, filename in enumerate(filenames):
        print(f'Progress {k+1}/{len(filenames)}: {filename}')

        raw_image = cv2.imread(filename, cv2.IMREAD_UNCHANGED)

        depth = depth_anything.infer_image(raw_image, args.input_size)

        if args.save_numpy:
            output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '_raw_depth_meter.npy')
            np.save(output_path, depth)

        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth = depth.astype(np.uint8)

        output_path = os.path.join(args.outdir, os.path.splitext(os.path.basename(filename))[0] + '.png')
        if args.pred_only:
            if args.grayscale:
                # Save as single-channel grayscale image
                cv2.imwrite(output_path, depth, [cv2.IMWRITE_PNG_COMPRESSION, 0])
            else:
                # Apply colormap and save as color image
                depth_color = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)
                cv2.imwrite(output_path, depth_color)
        else:
            if args.grayscale:
                # Convert raw_image to grayscale if it's color
                if len(raw_image.shape) == 3:
                    raw_image_gray = cv2.cvtColor(raw_image, cv2.COLOR_BGR2GRAY)
                else:
                    raw_image_gray = raw_image
                split_region = np.ones((raw_image.shape[0], 50), dtype=np.uint8) * 255
                combined_result = cv2.hconcat([raw_image_gray, split_region, depth])
            else:
                depth_color = (cmap(depth)[:, :, :3] * 255)[:, :, ::-1].astype(np.uint8)
                split_region = np.ones((raw_image.shape[0], 50, 3), dtype=np.uint8) * 255
                combined_result = cv2.hconcat([raw_image, split_region, depth_color])

            cv2.imwrite(output_path, combined_result)

        # Verify the saved image
        saved_image = cv2.imread(output_path, cv2.IMREAD_UNCHANGED)
        print(f"Saved image shape: {saved_image.shape}")

visonpon commented 3 weeks ago

@HaosenZ, my fine-tuning results look much like yours. I have also tried training with midas_loss, and the results still don't look good. Did you find a solution? By the way, the trick may be in the dataloader: we have to modify it for our own datasets, but I haven't figured that part out yet.
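
For what it's worth, "modifying the dataloader" usually comes down to two things: converting the stored ground-truth values into metric depth with whatever scale your dataset uses, and producing a valid mask so empty background pixels are excluded from the loss. The sketch below is not the repository's own dataset class; it is a generic example that assumes 16-bit depth PNGs in millimetres, and the class and key names are only illustrative:

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset

class MyDepthDataset(Dataset):
    """Hypothetical dataset: RGB images paired with 16-bit depth PNGs in millimetres."""

    def __init__(self, pairs, max_depth=20.0):
        self.pairs = pairs          # list of (image_path, depth_path) tuples
        self.max_depth = max_depth  # should match the --max-depth used at inference

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        img_path, depth_path = self.pairs[idx]
        image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB) / 255.0
        raw = cv2.imread(depth_path, cv2.IMREAD_UNCHANGED).astype(np.float32)

        depth = raw / 1000.0                                  # mm -> metres; adjust to your encoding
        valid_mask = (depth > 0) & (depth <= self.max_depth)  # drop empty background and outliers

        return {
            'image': torch.from_numpy(image).permute(2, 0, 1).float(),
            'depth': torch.from_numpy(depth).float(),
            'valid_mask': torch.from_numpy(valid_mask),
        }

Resizing, cropping and normalization are left out here; whatever transforms the training script applies to its existing datasets still need to be applied to yours.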