Saving depth maps first and then validating metrics on NYUv2 gives an RMSE of 0.375, but the RMSE reported in the paper is 0.27

I load the model "ZoeD_M12_N.pt" and run the following code, with minor modifications, to save the metric depth maps first.

import torch
import os
import datetime

import numpy as np

from PIL import Image
from pathlib import Path
from torchvision import transforms

from zoedepth.models.builder import build_model
from zoedepth.utils.config import get_config

from zoedepth.data.preprocess import get_black_border

# Expose only GPU 0 to this process; set at import time, before main() moves the model to 'cuda'.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def main():
    """Predict depth for every NYUv2 test RGB frame and save it as a 16-bit PNG.

    Recursively walks ``datas_dir`` for ``*.jpg`` files whose path contains
    ``rgb_``, runs ``infer_pil`` on each image, multiplies the prediction by
    1000 (presumably metres -> millimetres, matching the ``/1000`` applied when
    the PNGs are read back for evaluation) and stores it as ``uint16``.

    The output path is the input path with ``test`` -> ``method``,
    ``jpg`` -> ``png`` and ``rgb`` -> ``mde`` substituted, in that order, on
    the whole path string.

    Prints a summary line when done; returns ``None``.
    """
    datas_dir = 'nyu_scenes_split/test'
    method = 'mde_zoedepth_eval'
    crop_black_or_white_border_ = False

    # The "infer" config mode was also tried: get_config("zoedepth", "infer").
    conf = get_config("zoedepth", "eval")
    model_zoe_n = build_model(conf)
    # Put the module in evaluation mode so that any dropout / batch-norm
    # layers use their inference behaviour instead of the training default.
    zoe = model_zoe_n.to('cuda').eval()

    uint16_max = np.iinfo(np.uint16).max
    count = 0
    saves_dir = None  # stays None when no image matched
    for img_path in Path(datas_dir).rglob('*.jpg'):
        img_dir = str(img_path)
        if 'rgb_' not in img_dir:
            continue
        count += 1

        rgb_pil = Image.open(img_dir).convert('RGB')
        if crop_black_or_white_border_:
            # Cut the detected border away, then reflect-pad the crop back to
            # the original width/height so the output stays pixel-aligned.
            w, h = rgb_pil.size
            crop_params = get_black_border(np.array(rgb_pil, dtype=np.uint8))
            rgb_pil = rgb_pil.crop((crop_params.left, crop_params.top, crop_params.right, crop_params.bottom))
            rgb_array = np.pad(
                np.array(rgb_pil),
                ((crop_params.top, h - crop_params.bottom), (crop_params.left, w - crop_params.right), (0, 0)),
                mode='reflect',
            )
            rgb_pil = Image.fromarray(rgb_array)

        # NOTE(review): infer_pil is called with its default options. If those
        # defaults enable extra augmentation (e.g. input padding or flipping)
        # that the reference evaluate.py does not use, the saved maps will not
        # reproduce its metrics -- confirm against the ZoeDepth infer API.
        with torch.no_grad():
            mde_array = zoe.infer_pil(rgb_pil, output_type="numpy") * 1000

        # Round to the nearest integer and clamp to the uint16 range; a bare
        # astype() would truncate and wrap values outside [0, 65535].
        # (Saving the float array with np.save as .npy was also tried.)
        mde_array = np.clip(np.rint(mde_array), 0, uint16_max).astype(np.uint16)

        save_dir = img_dir.replace('test', method).replace('jpg', 'png').replace('rgb', 'mde')
        saves_dir = os.path.dirname(save_dir)
        os.makedirs(saves_dir, exist_ok=True)
        Image.fromarray(mde_array).save(save_dir)

    if saves_dir is None:
        print(f'{method} found no rgb_*.jpg images under {datas_dir}')
        return
    print(f'{method} has generated {count} mdes, saved to {saves_dir}')

if __name__ == '__main__':
    # Wall-clock timing of the whole run: the start time is printed up front
    # and once more at the end, followed by the end time.
    started_at = datetime.datetime.now()
    print('tm_begin: ', started_at)

    main()

    finished_at = datetime.datetime.now()
    print('tm_begin: ', started_at)
    print('tm_end: ', finished_at)

This way I get the output metric depth in mm. I then evaluate it with the following code and get a poor RMSE of 0.375:

from PIL import Image
from pathlib import Path

import datetime
import numpy as np
import torch
import math
import torch.nn.functional as F

def gaussian(window_size, sigma):
    """Return a 1-D Gaussian kernel of length ``window_size`` that sums to 1.

    The kernel is centred on index ``window_size // 2`` and has standard
    deviation ``sigma``. The result is a 1-D ``torch.Tensor``.
    """
    center = window_size // 2
    two_sigma_sq = float(2 * sigma ** 2)
    weights = torch.Tensor(
        [math.exp(-((i - center) ** 2) / two_sigma_sq) for i in range(window_size)]
    )
    # Normalise so the weights add up to one.
    return weights / weights.sum()
def create_window(window_size, channel=1):
    """Build a 2-D Gaussian window shaped for a grouped ``conv2d``.

    The 2-D kernel is the outer product of the 1-D Gaussian (sigma 1.5) with
    itself. It is returned as a contiguous tensor of shape
    ``(channel, 1, window_size, window_size)``, i.e. one copy per channel.
    """
    column = gaussian(window_size, 1.5).unsqueeze(1)  # (window_size, 1)
    kernel_2d = torch.mm(column, column.t()).float()
    kernel_4d = kernel_2d.unsqueeze(0).unsqueeze(0)  # (1, 1, ws, ws)
    return kernel_4d.expand(channel, 1, window_size, window_size).contiguous()
def compute_ssim(pre_gsmaps, gt_gsmaps, window_size=11, size_average=True, full=False, val_range=None, window=None):
    """Compute SSIM between two 4-D tensors ``(batch, channel, height, width)``.

    Args:
        pre_gsmaps: predicted maps.
        gt_gsmaps: reference maps, convolved with the same window.
        window_size: requested window side; shrunk to ``min(window_size,
            height, width)`` when the window is created here.
        size_average: if true return the mean over the whole SSIM map,
            otherwise the mean over dimensions 1..3 (one value per batch item).
        full: if true also return the contrast-sensitivity term.
        val_range: dynamic range ``L``. When ``None`` it is guessed from
            ``pre_gsmaps``: max is 255 if any value exceeds 128 else 1, and
            min is -1 if any value is below -0.5 else 0.
        window: optional precomputed convolution window; when ``None`` one is
            built with ``create_window`` on the device of ``pre_gsmaps``.

    Returns:
        ``ret`` or, when ``full`` is true, the tuple ``(ret, cs)``.
    """
    if val_range is not None:
        L = val_range
    else:
        max_val = 255 if torch.max(pre_gsmaps) > 128 else 1
        min_val = -1 if torch.min(pre_gsmaps) < -0.5 else 0
        L = max_val - min_val

    (_, channel, height, width) = pre_gsmaps.size()
    if window is None:
        real_size = min(window_size, height, width)
        window = create_window(real_size, channel=channel).to(pre_gsmaps.device)

    def _local_mean(t):
        # Windowed average, applied per channel with no padding.
        return F.conv2d(t, window, padding=0, groups=channel)

    mu_pred = _local_mean(pre_gsmaps)
    mu_gt = _local_mean(gt_gsmaps)

    mu_pred_sq = mu_pred.pow(2)
    mu_gt_sq = mu_gt.pow(2)
    mu_cross = mu_pred * mu_gt

    # Local variances / covariance as E[xy] - E[x]E[y].
    var_pred = _local_mean(pre_gsmaps * pre_gsmaps) - mu_pred_sq
    var_gt = _local_mean(gt_gsmaps * gt_gsmaps) - mu_gt_sq
    covar = _local_mean(pre_gsmaps * gt_gsmaps) - mu_cross

    c1 = (0.01 * L) ** 2
    c2 = (0.03 * L) ** 2

    cs_num = 2.0 * covar + c2
    cs_den = var_pred + var_gt + c2
    cs = torch.mean(cs_num / cs_den)  # contrast sensitivity

    ssim_map = ((2 * mu_cross + c1) * cs_num) / ((mu_pred_sq + mu_gt_sq + c1) * cs_den)

    if size_average:
        # Without the mean this would return the full SSIM map.
        ret = ssim_map.mean()
    else:
        ret = ssim_map.mean(1).mean(1).mean(1)

    return (ret, cs) if full else ret

def gen_1d_rand_pairs(gt_1d_array, point_num):
    """Draw ``point_num`` random index pairs into a 1-D array.

    The generator is seeded with ``int(gt_1d_array[0])`` so the same input
    always yields the same pairs. A private ``RandomState`` is used, which
    produces the same stream as seeding the global generator but leaves the
    global NumPy random state untouched.

    Args:
        gt_1d_array: non-empty 1-D array; only its length and first element
            are used.
        point_num: number of pairs to draw.

    Returns:
        Float array of shape ``(2, point_num)`` holding whole-number indices
        in ``[0, len(gt_1d_array))``; row 0 and row 1 are the two pair ends.

    Raises:
        ValueError: if the array is not 1-D or is empty.
    """
    if gt_1d_array.ndim != 1:
        raise ValueError('array must be 1d')
    length = gt_1d_array.shape[0]
    if length == 0:
        raise ValueError('array must not be empty')

    rng = np.random.RandomState(int(gt_1d_array[0]))
    # Uniform samples in [0, 1) scaled to [0, length) and floored to indices.
    return np.floor(rng.rand(2, point_num) * length)

def ranking_1d_eval(pre, gt, positions, t=0.03):
    """Return the fraction of sampled pairs whose ordinal relation is wrong.

    For every pair the ratio ``z1 / z2`` is mapped to an ordinal label:
    ``+1`` if above ``1 + t``, ``-1`` if below ``1 - t``, otherwise ``0``.
    A pair counts as wrong when the label from ``pre`` differs from the label
    from ``gt``.

    Pairs whose ground truth is zero at both ends are neutralised by setting
    all four sampled values to 1 *before* the ratios are formed, so both
    labels become 0 and the pair is counted as correct. (Previously this
    substitution ran after the division and therefore had no effect.)

    Args:
        pre: 1-D array of predictions.
        gt: 1-D array of ground-truth values, same indexing as ``pre``.
        positions: array of shape ``(2, n)`` with the pair indices; values are
            cast to ``int64``.
        t: relative tolerance around a ratio of 1.

    Returns:
        ``wrong_pairs / n`` as a float.

    Raises:
        ZeroDivisionError: if ``positions`` holds no pairs.
    """
    idx1 = positions[0].astype(np.int64)
    idx2 = positions[1].astype(np.int64)

    # Integer-array indexing returns copies, so the substitutions below never
    # modify the caller's ``pre`` / ``gt``.
    z_1_pre = pre[idx1]
    z_2_pre = pre[idx2]
    z_1_gt = gt[idx1]
    z_2_gt = gt[idx2]

    both_zero = (z_1_gt == 0) & (z_2_gt == 0)
    z_1_pre[both_zero] = 1
    z_2_pre[both_zero] = 1
    z_1_gt[both_zero] = 1
    z_2_gt[both_zero] = 1

    rela_pre = z_1_pre / z_2_pre
    rela_gt = z_1_gt / z_2_gt

    def _ordinal_labels(ratio):
        # +1: first point clearly larger, -1: clearly smaller, 0: roughly equal.
        labels = np.zeros_like(rela_pre)
        labels[ratio > (1 + t)] = 1
        labels[ratio < (1 - t)] = -1
        return labels

    diff_mask = _ordinal_labels(rela_gt) - _ordinal_labels(rela_pre)
    wrong_points = np.count_nonzero(diff_mask)

    return wrong_points / len(rela_gt)

def compute_errors(gt, pred):
    """Compute the twelve evaluation measures for one image.

    Args:
        gt: 1-D NumPy array of ground-truth depths at the valid pixels. Values
            must be non-zero: the code divides by them and takes reciprocals.
        pred: 1-D NumPy array of predicted depths at the same pixels, also
            non-zero.

    Returns:
        ``np.ndarray`` of length 12 in the order
        ``[oe, psnr, ssim, absrel, rmse, mae, iabsrel, irmse, imae, d1, d2, d3]``
        where ``oe`` and ``d1``..``d3`` are expressed in percent.
    """
    # Ordinal error on 50000 pseudo-random pixel pairs.
    rand_pairs = gen_1d_rand_pairs(gt, point_num=50000)
    oe = ranking_1d_eval(pred, gt, rand_pairs) * 100

    # NOTE(review): the inputs are flat vectors, so SSIM sees a (1, 1, 1, N)
    # tensor and its window shrinks to min(window_size, 1, N) = 1 pixel; the
    # value is therefore not a spatial SSIM -- confirm this is intended.
    pre_tensor = torch.from_numpy(pred).unsqueeze(0).unsqueeze(0).unsqueeze(0)
    gt_tensor = torch.from_numpy(gt).unsqueeze(0).unsqueeze(0).unsqueeze(0)
    # Convert the 0-d tensor explicitly instead of relying on np.array() to
    # coerce it when the result vector is assembled.
    ssim = float(compute_ssim(pre_tensor, gt_tensor))

    diff = gt - pred
    rmse = np.sqrt((diff ** 2).mean())
    # NOTE(review): PSNR uses a peak value of 255 -- confirm that is the
    # intended peak for depth inputs.
    psnr = 20 * np.log10(255 / rmse)
    mae = np.mean(np.abs(diff))
    absrel = np.mean(np.abs(diff) / gt)

    # The same errors on inverse depth.
    igt = 1 / gt
    ipred = 1 / pred
    inv_diff = igt - ipred
    iabsrel = np.mean(np.abs(inv_diff) / igt)
    irmse = np.sqrt(np.mean(inv_diff ** 2))
    imae = np.mean(np.abs(inv_diff))

    # Threshold accuracies: share of pixels with max(gt/pred, pred/gt) < 1.25^k.
    thresh = np.maximum((gt / pred), (pred / gt))
    d1 = (thresh < 1.25).mean()
    d2 = (thresh < 1.25 ** 2).mean()
    d3 = (thresh < 1.25 ** 3).mean()

    return np.array([oe, psnr, ssim, absrel, rmse, mae, iabsrel, irmse, imae, d1*100, d2*100, d3*100])

def main():
    """Average the evaluation measures of saved depth PNGs over the test set.

    For every ``*.png`` found recursively under ``nyu_scenes_split/test`` the
    matching prediction is located by substituting ``test`` -> ``method`` and
    ``sync_depth`` -> ``mde`` in the path string. Both images are read as
    float32 and divided by 1000, the prediction is clamped to ``[1e-3, 10]``,
    and ``compute_errors`` is evaluated on the pixels that have ground truth
    in ``(1e-3, 10)`` and lie inside the fixed crop ``[45:471, 41:601]``.
    The per-image measures are averaged and printed.

    Raises:
        FileNotFoundError: if no ground-truth PNG is found (instead of
            printing a row of NaNs).
        ValueError: if a prediction and its ground truth differ in shape.
    """
    # Other prediction folders evaluated this way: 'mde_newcrfs', 'mde_zoedepth'.
    method = 'mde_zoedepth_eval'

    gts_path = Path('nyu_scenes_split/test')
    measures_sum = np.zeros(12)
    cnt = 0
    for gt_path in gts_path.rglob('*.png'):
        gt_dir = str(gt_path)
        mde_dir = gt_dir.replace('test', method).replace('sync_depth', 'mde')

        mde_array = np.array(Image.open(mde_dir), dtype=np.float32) / 1000
        gt_array = np.array(Image.open(gt_dir), dtype=np.float32) / 1000
        if mde_array.shape != gt_array.shape:
            raise ValueError(
                f'shape mismatch: {mde_dir} is {mde_array.shape}, {gt_dir} is {gt_array.shape}'
            )

        # Clamp the prediction to the evaluated depth range and scrub
        # non-finite values.
        mde_array[mde_array < 1e-3] = 1e-3
        mde_array[mde_array > 10] = 10
        mde_array[np.isinf(mde_array)] = 10
        mde_array[np.isnan(mde_array)] = 1e-3

        # Valid pixels: ground truth inside the depth range AND inside the
        # fixed evaluation crop (presumably the NYUv2 Eigen crop -- confirm).
        eval_mask = np.zeros(gt_array.shape, dtype=bool)
        eval_mask[45:471, 41:601] = True
        valid_mask = np.logical_and(gt_array > 1e-3, gt_array < 10)
        valid_mask = np.logical_and(valid_mask, eval_mask)

        measures_sum += compute_errors(gt_array[valid_mask], mde_array[valid_mask])
        cnt += 1

    if cnt == 0:
        raise FileNotFoundError(f'no ground-truth PNG files found under {gts_path}')

    measures_sum /= cnt
    print('oe psnr ssim absrel rmse mae iabsrel irmse imae d1 d2 d3')
    # First nine measures with three decimals, the three accuracies with one.
    print(' '.join(
        [f'{value:.3f}' for value in measures_sum[:9]]
        + [f'{value:.1f}' for value in measures_sum[9:]]
    ))

if __name__ == '__main__':
    # Wall-clock timing of the whole evaluation: the start time is printed up
    # front and once more at the end, followed by the end time.
    started_at = datetime.datetime.now()
    print('tm_begin: ', started_at)

    main()

    finished_at = datetime.datetime.now()
    print('tm_begin: ', started_at)
    print('tm_end: ', finished_at)

I compute the RMSE in the same way for the method "NeWCRFs" and get the expected RMSE of 0.333. However, with ZoeDepth I get an RMSE of 0.375. As the comments and flags in my code show, I have tried cropping the black border of the RGB image the same way ZoeDepth does, changing the config mode from 'infer' to 'eval', and saving .npy files to avoid round-off error, but none of these helped. Interestingly, when I run the authors' evaluate.py directly, I get the expected RMSE of 0.27.

Directly using "mde_array = zoe.infer_pil(rgb_pil, output_type="numpy")" is a very convenient way to save depth maps, but what causes the metrics to drop?

isl-org / ZoeDepth

Saving depth maps first and then validating metrics on NYUv2 gives an RMSE of 0.375, but the RMSE reported in the paper is 0.27 #111