TencentARC / ViT-Lens

[CVPR 2024] ViT-Lens: Towards Omni-modal Representations
https://ailab-cvc.github.io/seed/vitlens/

Reproducing NYUv2 Results #9

Closed jbrownkramer closed 7 months ago

jbrownkramer commented 7 months ago

This code documents the processing pipeline well, but it starts from disparity images, whereas NYUv2 provides depth images. What baseline and focal length are you using to convert NYUv2 depth to disparity? My best guess is

f = 518.857901, b = 75

However, that seems like it could be off by an order of magnitude. Help would be appreciated.
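
For concreteness, this is the kind of conversion I have in mind (a minimal sketch; the f and b above are my guesses, and the unit of b is exactly what I am unsure about):

import numpy as np

def depth_to_disparity_guess(depth_m, f=518.857901, b=75.0):
    # guessed conversion: disparity = f * b / depth; if b should be in meters
    # rather than millimeters, the result changes by a large constant factor
    depth_m = np.clip(depth_m, 1e-3, None)  # avoid division by zero
    return f * b / depth_m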

StanLei52 commented 7 months ago

Hi, please see the code snippet below for converting depth to disparity (along with checks on the data). Please make sure that you have downloaded the same data as ours, and use the depth maps in the depth_bfx folders as inputs for the conversion.

import os
import json

import numpy as np
import scipy.io
import torch
from PIL import Image
from tqdm import tqdm

def get_sensor_type_baseline(path):
    """Return the baseline (in meters) for the depth-to-disparity conversion, keyed on the SUN RGB-D sensor type in the path."""
    if "kv1" in path:
        return 0.075
    elif "kv2" in path:
        return 0.075
    elif "realsense" in path:
        return 0.095
    elif "xtion" in path:
        return 0.095
    else:
        raise ValueError(f"Unknown sensor type in path: {path}")

def convert_depth_to_disparity(depth_file, intrinsics_file, min_depth=0.01, max_depth=50):
    """
    depth_file is a png file that contains the scene depth
    intrinsics_file is a txt file supplied in SUNRGBD with sensor information
            Can be found at the path: os.path.join(root_dir, room_name, "intrinsics.txt")
    """
    with open(intrinsics_file, 'r') as fh:
        lines = fh.readlines()
        # the focal length is the first entry of the intrinsics matrix
        focal_length = float(lines[0].strip().split()[0])
    baseline = get_sensor_type_baseline(depth_file)
    # depth PNG values are treated as millimeters; convert to meters and clip
    depth = np.array(Image.open(depth_file)).astype(np.float32)
    depth_in_meters = depth / 1000.
    if min_depth is not None:
        depth_in_meters = depth_in_meters.clip(min=min_depth, max=max_depth)
    disparity = baseline * focal_length / depth_in_meters
    return torch.from_numpy(disparity).float()

def check_sun_rgbd():
    keep_labels = ['bathroom', 'bedroom', 'classroom', 'computer_room',
       'conference_room', 'corridor', 'dining_area', 'dining_room',
       'discussion_area', 'furniture_store', 'home_office', 'kitchen',
       'lab', 'lecture_theatre', 'library', 'living_room', 'office',
       'rest_space', 'study_space']

    mat = scipy.io.loadmat("/pathto/dataset/SUNRGBD/SUNRGBDtoolbox/traintestSUNRGBD/allsplit.mat")
    anno_test = mat["alltest"][0]
    test_scene_set = set()
    n_keep = 0
    n_nyu = 0
    test_meta = []
    for i in tqdm(range(len(anno_test)), total=len(anno_test), desc="sun-rgbd-Val"):
        path = str(anno_test[i][0])
        path = path.replace("/n/fs/sun3d/data/SUNRGBD", "/pathto/dataset/SUNRGBD")
        scene_lbl_path = os.path.join(path, "scene.txt")
        intrinsic_path = os.path.join(path, "intrinsics.txt")
        img_dir = os.path.join(path, "image")
        depth_dir = os.path.join(path, "depth_bfx")
        img_fn = os.listdir(img_dir)[0]
        depth_fn = os.listdir(depth_dir)
        depth_fn = [i for i in depth_fn if i.endswith(".png")][0]
        depth_path = os.path.join(depth_dir, depth_fn)
        disparity_path = depth_path.replace(".png", "_disparity.pt")

        disparity = convert_depth_to_disparity(depth_path, intrinsic_path)
        torch.save(disparity, disparity_path)

        with open(scene_lbl_path, "r") as f:
            scene_lbl = f.readlines()[0]

        if scene_lbl in keep_labels:
            n_keep += 1
            # follow ImageBind to only use those in `keep_labels`
            test_meta.append(
                {
                    "image_path": os.path.join(img_dir, img_fn),
                    "depth_path": depth_path,
                    "disparity_path": disparity_path,
                    "label": scene_lbl,
                    "cleaned_label": scene_lbl.replace("_", " ")
                }
            )

        if "NYU" in scene_lbl_path:
            n_nyu += 1

        test_scene_set.add(scene_lbl)

    print("====== Test info : ")
    print(test_scene_set, len(test_scene_set), n_keep, n_nyu, len(anno_test), len(test_meta))
    with open("/pathto/code/open_clip/src/open_clip/modal_depth/data/SUN-RGBD_val.json", "w") as f:
        json.dump(test_meta, f, indent=2)

    anno_train = mat["alltrain"][0]
    train_scene_set = set()
    n_keep = 0
    n_nyu = 0
    train_meta = []
    for i in tqdm(range(len(anno_train)), total=len(anno_train), desc="sun-rgbd-Train"):
        path = str(anno_train[i][0])
        path = path.replace("/n/fs/sun3d/data/SUNRGBD", "/pathto/dataset/SUNRGBD")
        scene_lbl_path = os.path.join(path, "scene.txt")
        intrinsic_path = os.path.join(path, "intrinsics.txt")
        img_dir = os.path.join(path, "image")
        depth_dir = os.path.join(path, "depth_bfx")
        img_fn = os.listdir(img_dir)[0]
        depth_fn = os.listdir(depth_dir)
        depth_fn = [i for i in depth_fn if i.endswith(".png")][0]
        depth_path = os.path.join(depth_dir, depth_fn)
        disparity_path = depth_path.replace(".png", "_disparity.pt")

        disparity = convert_depth_to_disparity(depth_path, intrinsic_path)
        torch.save(disparity, disparity_path)

        with open(scene_lbl_path, "r") as f:
            scene_lbl = f.readlines()[0]
        if scene_lbl in keep_labels:
            n_keep += 1
        if "NYU" in scene_lbl_path:
            n_nyu += 1
        train_scene_set.add(scene_lbl)
        train_meta.append(
            {
                "image_path": os.path.join(img_dir, img_fn),
                "depth_path": depth_path,
                "disparity_path": disparity_path,
                "label": scene_lbl,
                "cleaned_label": scene_lbl.replace("_", " ")
            }            
        )

    print("====== Train info:")
    print(train_scene_set, len(train_scene_set), n_keep, n_nyu, len(anno_train), len(train_meta))
    with open("/pathto/code/open_clip/src/open_clip/modal_depth/data/SUN-RGBD_train.json", "w") as f:
        json.dump(train_meta, f, indent=2)

def check_nyu_rgbd():
    # for dataset: rgb path, depth path, label tag (possible prompt)
    official_scene_names = json.load(open("/pathto/code/open_clip/src/open_clip/modal_depth/data/nyu-depth-v2_scene_name.json","r"))
    keep_labels = [
        "bedroom", "kitchen", "living_room", "bathroom", "dining_room", "office", "home_office", "classroom", "bookstore",
    ]
    meta = []
    mat = scipy.io.loadmat("/pathto/dataset/SUNRGBD/SUNRGBDtoolbox/traintestSUNRGBD/allsplit.mat")
    anno_test = mat["alltest"][0]
    test_scene_set = set()
    n_nyu = 0
    for i in tqdm(range(len(anno_test)), total=len(anno_test)):
        path = str(anno_test[i][0])
        path = path.replace("/n/fs/sun3d/data/SUNRGBD", "/pathto/dataset/SUNRGBD")
        scene_lbl_path = os.path.join(path, "scene.txt")
        img_dir = os.path.join(path, "image")
        depth_dir = os.path.join(path, "depth_bfx")
        img_fn = os.listdir(img_dir)[0]
        # keep only the depth PNG (the *_disparity.pt files saved earlier live in the same folder)
        depth_fn = [fn for fn in os.listdir(depth_dir) if fn.endswith(".png")][0]
        disparity_fn = depth_fn.replace(".png", "_disparity.pt")

        with open(scene_lbl_path, "r") as f:
            scene_lbl = f.readlines()[0]
        if "NYU" not in scene_lbl_path:
            continue

        # extract the NYUv2 frame index from the image filename (digits following the "NYU" prefix)
        nyu_idx = int(img_fn[3:7])
        n_nyu += 1
        test_scene_set.add(scene_lbl)
        meta.append(
            {
                "image_path": os.path.join(img_dir, img_fn),
                "depth_path": os.path.join(depth_dir, depth_fn),
                "disparity_path": os.path.join(depth_dir, disparity_fn),
                "label": scene_lbl,
                "cleaned_label": scene_lbl.replace("_", " "),
                "benchmark_label": scene_lbl.replace("_", " ") if scene_lbl in keep_labels else "others",
                "official_label": official_scene_names[nyu_idx-1]
            }            
        )

    print("====== NYU Data info : ")
    print(test_scene_set, len(test_scene_set), n_nyu, len(anno_test), len(meta))
    with open("/pathto/code/open_clip/src/open_clip/modal_depth/data/NYU-Depth-v2_val.json", "w") as f:
        json.dump(meta, f, indent=2)

You can find f and b from the code above and the downloaded data. Hope that helps :)
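
For a single NYU frame, a minimal usage sketch would look like the following (the room path is a placeholder and assumes the standard SUN RGB-D layout, where NYU frames sit under kv1/NYUdata; since NYU data come from the kv1 sensor, get_sensor_type_baseline returns 0.075 for them):

# hypothetical example paths; adapt to your SUN RGB-D download
room_dir = "/pathto/dataset/SUNRGBD/kv1/NYUdata/NYU0001"
depth_dir = os.path.join(room_dir, "depth_bfx")
depth_fn = [fn for fn in os.listdir(depth_dir) if fn.endswith(".png")][0]
depth_path = os.path.join(depth_dir, depth_fn)
intrinsics_path = os.path.join(room_dir, "intrinsics.txt")

disparity = convert_depth_to_disparity(depth_path, intrinsics_path)
print(disparity.shape, float(disparity.min()), float(disparity.max()))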

jbrownkramer commented 7 months ago

This is amazing! Thank you!