Closed · swjTheDad closed this issue 11 months ago
Hi,
May I ask which scene you used to produce this result, and is there a minimal sample with which I can reproduce it? The result looks very strange. For example, a Pair Rot Error (Deg) of 76.52 means the model has not learned anything; it might even be worse than a random guess. Also, the rotation error is usually smaller than the translation error, because the relative translation depends on the relative rotation. My first impression is that there is a bug somewhere.
Besides, could you check its behavior on more samples? Another suggestion would be to also check the result without GGS, since GGS is not part of the training phase.
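(For context: "Pair Rot Error" and "Pair Trans Error" compare the predicted and ground-truth relative poses over all image pairs. Below is a minimal sketch of the rotation part, assuming world-to-camera rotation matrices; it is not the repo's exact util.metric.camera_to_rel_deg implementation.)

import numpy as np

def pair_rot_error_deg(R1_pred, R2_pred, R1_gt, R2_gt):
    """Geodesic angle (degrees) between predicted and GT relative rotations."""
    rel_pred = R2_pred @ R1_pred.T  # predicted relative rotation, cam1 -> cam2
    rel_gt = R2_gt @ R1_gt.T        # ground-truth relative rotation
    # Angle of the residual rotation rel_pred @ rel_gt^T
    cos_angle = (np.trace(rel_pred @ rel_gt.T) - 1.0) / 2.0
    return np.degrees(np.arccos(np.clip(cos_angle, -1.0, 1.0)))

Because the relative translation direction is evaluated in a frame tied to the relative rotation, a large rotation error usually drags the translation error up with it; a rotation error (76.52) far above the translation error (45.81) is therefore an unusual pattern.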
The test scene used here is 'samples/apple' from the PoseDiffusion demo. The pretrained model linked in the readme.md performs well on this scene.
I only made a small modification to the demo's code. The modified version is:
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import datetime
import glob
import os
import re
import time
from pathlib import Path
import numpy as np
import torch
from typing import Dict, List, Optional, Union
from omegaconf import OmegaConf, DictConfig
import hydra
from hydra.utils import instantiate, get_original_cwd
import models
from functools import partial
from pytorch3d.renderer.cameras import PerspectiveCameras
from pytorch3d.ops import corresponding_cameras_alignment
from pytorch3d.implicitron.tools import model_io, vis_utils
from pytorch3d.vis.plotly_vis import plot_scene
from util.utils import seed_all_random_engines
from util.match_extraction import extract_match
from util.load_img_folder import load_and_preprocess_images
from util.geometry_guided_sampling import geometry_guided_sampling
from util.metric import camera_to_rel_deg, calculate_auc_np
#from util.metric import compute_ARE
from visdom import Visdom
@hydra.main(config_path="../cfgs/", config_name="default")
def demo(cfg: DictConfig) -> None:
    OmegaConf.set_struct(cfg, False)
    print("Model Config:")
    print(OmegaConf.to_yaml(cfg))

    # Check for GPU availability and set the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Instantiate the model
    model = instantiate(cfg.MODEL, _recursive_=False)

    # Load and preprocess images
    original_cwd = get_original_cwd()  # Get original working directory
    folder_path = os.path.join(original_cwd, cfg.image_folder)
    images, image_info = load_and_preprocess_images(folder_path, None, cfg.image_size)

    # Load checkpoint
    ckpt_path = os.path.join(original_cwd, cfg.ckpt)
    if os.path.isfile(ckpt_path):
        checkpoint = torch.load(ckpt_path, map_location=device)
        model.load_state_dict(checkpoint, strict=True)
        print(f"Loaded checkpoint from: {ckpt_path}")
    else:
        raise ValueError(f"No checkpoint found at: {ckpt_path}")

    # Move model and images to the GPU
    model = model.to(device)
    images = images.to(device)

    # Evaluation mode
    model.eval()

    # Seed random engines
    seed_all_random_engines(cfg.seed)

    # Start the timer
    start_time = time.time()

    # Perform match extraction
    if cfg.GGS.enable:
        # Optional TODO: remove the keypoints outside the cropped region?
        kp1, kp2, i12 = extract_match(None, folder_path, image_info)

        if kp1 is not None:
            keys = ["kp1", "kp2", "i12", "img_shape"]
            values = [kp1, kp2, i12, images.shape]
            matches_dict = dict(zip(keys, values))

            cfg.GGS.pose_encoding_type = cfg.MODEL.pose_encoding_type
            GGS_cfg = OmegaConf.to_container(cfg.GGS)

            cond_fn = partial(geometry_guided_sampling, matches_dict=matches_dict, GGS_cfg=GGS_cfg)
            print("\033[92m=====> Sampling with GGS <=====\033[0m")
        else:
            cond_fn = None
    else:
        cond_fn = None
        print("\033[92m=====> Sampling without GGS <=====\033[0m")

    images = images.unsqueeze(0)

    # Forward pass
    with torch.no_grad():
        # Obtain predicted camera parameters.
        # pred_cameras is a PerspectiveCameras object with attributes
        # pred_cameras.R, pred_cameras.T, pred_cameras.focal_length.
        # The poses and focal lengths are defined in the NDC coordinate system, see
        # https://github.com/facebookresearch/pytorch3d/blob/main/docs/notes/cameras.md
        predictions = model(image=images, cond_fn=cond_fn, cond_start_step=cfg.GGS.start_step, training=False)

    pred_cameras = predictions["pred_cameras"]

    # Stop the timer and report the elapsed time
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Time taken: {:.4f} seconds".format(elapsed_time))

    '''
    # Visualization
    try:
        viz = Visdom()
        cams_show = {"ours_pred": pred_cameras, "ours_pred_aligned": pred_cameras_aligned, "gt_cameras": gt_cameras}
        fig = plot_scene({f"{folder_path}": cams_show})
        viz.plotlyplot(fig, env="visual", win="cams")
    except:
        print("Please check your visdom connection")
    '''

    # Compute metrics if GT poses are available
    if os.path.exists(os.path.join(folder_path, "gt_cameras.npz")):
        gt_cameras_dict = np.load(os.path.join(folder_path, "gt_cameras.npz"))
        gt_cameras = PerspectiveCameras(
            focal_length=gt_cameras_dict["gtFL"], R=gt_cameras_dict["gtR"], T=gt_cameras_dict["gtT"], device=device
        )

        # 7-DoF alignment using Umeyama's algorithm
        pred_cameras_aligned = corresponding_cameras_alignment(
            cameras_src=pred_cameras, cameras_tgt=gt_cameras, estimate_scale=True, mode="extrinsics", eps=1e-9
        )

        print(pred_cameras_aligned)
        print(gt_cameras)
        with open('output.txt', 'w') as f:
            f.write(str(pred_cameras_aligned) + '\n')
            f.write(str(gt_cameras) + '\n')

        # Compute the relative (pairwise) rotation and translation errors
        batch_size = 1
        print(batch_size)
        rel_rangle_deg, rel_tangle_deg = camera_to_rel_deg(pred_cameras, gt_cameras, device, batch_size)
        print(f" -- Pair Rot Error (Deg): {rel_rangle_deg.mean():10.2f}")
        print(f" -- Pair Trans Error (Deg): {rel_tangle_deg.mean():10.2f}")

        # ARE = compute_ARE(pred_cameras_aligned.R, gt_cameras.R).mean()
        # print(f"For {folder_path}: the absolute rotation error is {ARE:.6f} degrees.")
    else:
        print("No GT provided. No evaluation conducted.")


if __name__ == "__main__":
    demo()
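(For reference: since the script reads cfg.image_folder, cfg.ckpt, and cfg.image_size from the Hydra config, those values can be overridden on the command line when launching the demo, along the lines of python demo.py image_folder="samples/apple" ckpt=/PATH/TO/CKPT, with the exact keys matching your cfgs/default.yaml.)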
Can you also share the model you trained yourself (the one that achieved Pair Rot Error (Deg): 76.52)?
It would also be helpful to know how many steps this model was trained for and how many GPUs you used.
Sure, here is the link: https://pan.baidu.com/s/1sO-TejXdl9ZQvaqW_aldhw (extraction code: 3q6w)
The model was trained for only 5 epochs, but the loss was already decreasing very slowly. I used a max_image of 512 and trained on a single A100.
Great. Can you also check the scene name for the sample (Pair Rot Error (Deg): 0.67, Pair Trans Error (Deg): 0.86)? The scene name can be found in the output log when testing.
Sure.
/home/evsjtu/swj/PoseDiffusion-main/pose_diffusion/test.py:38: UserWarning:
The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
@hydra.main(config_path="../cfgs/", config_name="default_test")
/home/evsjtu/anaconda3/envs/posediffusion/lib/python3.9/site-packages/hydra/_internal/hydra.py:119: UserWarning: Future Hydra versions will no longer change working directory at job runtime by default.
See https://hydra.cc/docs/1.2/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
ret = run_job(
Model Config:
seed: 0
test:
  CO3D_DIR: /home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown
  CO3D_ANNOTATION_DIR: /home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple
  resume_ckpt: /home/evsjtu/swj/PoseDiffusion-main/pytorch_model.pth
  random_order: true
  num_frames: 10
  img_size: 224
  category: seen
  normalize_cameras: true
  persistent_workers: true
  preload_image: false
  cudnnbenchmark: false
  first_camera_transform: true
  min_num_images: 50
  compute_optical: true
GGS:
  enable: true
  start_step: 10
  learning_rate: 0.01
  iter_num: 100
  sampson_max: 10
  min_matches: 10
  alpha: 0.0001
debug: false
MODEL:
  _target_: models.PoseDiffusionModel
  pose_encoding_type: absT_quaR_logFL
  IMAGE_FEATURE_EXTRACTOR:
    _target_: models.MultiScaleImageFeatureExtractor
    modelname: dino_vits16
    freeze: false
  DENOISER:
    _target_: models.Denoiser
    TRANSFORMER:
      _target_: models.TransformerEncoderWrapper
      d_model: 512
      nhead: 4
      dim_feedforward: 1024
      num_encoder_layers: 8
      dropout: 0.1
      batch_first: true
      norm_first: true
  DIFFUSER:
    _target_: models.GaussianDiffusion
    beta_schedule: custom
Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Mixed precision type: no
----------Seed is set to 0 now----------
Using cache found in /home/evsjtu/.cache/torch/hub/facebookresearch_dino_main
Successfully resumed from /home/evsjtu/swj/PoseDiffusion-main/pytorch_model.pth
----------------------------------------------------------------------------------------------------
Testing on ['apple', 'backpack', 'banana', 'baseballbat', 'baseballglove', 'bench', 'bicycle', 'bottle', 'bowl', 'broccoli', 'cake', 'car', 'carrot', 'cellphone', 'chair', 'cup', 'donut', 'hairdryer', 'handbag', 'hydrant', 'keyboard', 'laptop', 'microwave', 'motorcycle', 'mouse', 'orange', 'parkingmeter', 'pizza', 'plant', 'stopsign', 'teddybear', 'toaster', 'toilet', 'toybus', 'toyplane', 'toytrain', 'toytruck', 'tv', 'umbrella', 'vase', 'wineglass']
----------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------
Category apple Start
******************************** test on apple ********************************
CO3D_DIR is /home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown
/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/apple_test.jgz
1
Low quality translation sequences, not used: []
Data size: 1
Testing the sequence 189_20393_38136 of category apple
['/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000008.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000037.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000057.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000110.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000116.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000149.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000156.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000177.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000189.jpg', '/home/evsjtu/swj/PoseDiffusion-main/relpose-plus-plus/co3ddown/apple/189_20393_38136/images/frame000196.jpg']
[2023/12/18 19:14:05 hloc INFO] Extracting local features with configuration:
{'model': {'max_keypoints': 4096, 'name': 'superpoint', 'nms_radius': 4},
'output': 'feats-superpoint-n4096-r1600',
'preprocessing': {'grayscale': True, 'resize_max': 1600}}
Loaded SuperPoint model
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00, 3.02it/s]
[2023/12/18 19:14:08 hloc INFO] Finished exporting features.
[2023/12/18 19:14:08 hloc INFO] Found 45 pairs.
[2023/12/18 19:14:08 hloc INFO] Matching local features with configuration:
{'model': {'name': 'superglue',
'sinkhorn_iterations': 50,
'weights': 'outdoor'},
'output': 'matches-superglue'}
Loaded SuperGlue model ("outdoor" weights)
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:03<00:00, 12.77it/s]
[2023/12/18 19:14:12 hloc INFO] Finished exporting matches.
[2023/12/18 19:14:12 hloc INFO] Creating an empty database...
[2023/12/18 19:14:12 hloc INFO] Importing images into the database...
[2023/12/18 19:14:12 hloc INFO] Importing features into the database...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1442.43it/s]
[2023/12/18 19:14:12 hloc INFO] Importing matches into the database...
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 1168.87it/s]
[2023/12/18 19:14:12 hloc INFO] Performing geometric verification of the matches...
=====> Sampling with GGS <=====
/home/evsjtu/anaconda3/envs/posediffusion/lib/python3.9/site-packages/torch/functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /opt/conda/conda-bld/pytorch_1666643016022/work/aten/src/ATen/native/TensorShape.cpp:3190.)
return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]
t=09 | sampson=9.013452
t=09 | sampson=9.001946
t=09 | sampson=8.975619
t=09 | sampson=8.953597
t=09 | sampson=8.897875
t=08 | sampson=8.847275
t=08 | sampson=8.837044
t=08 | sampson=8.818635
t=08 | sampson=8.816516
t=08 | sampson=7.834114
t=07 | sampson=7.766330
t=07 | sampson=7.785002
t=07 | sampson=7.734869
t=07 | sampson=7.732924
t=07 | sampson=7.691217
t=06 | sampson=6.527886
t=06 | sampson=6.533252
t=06 | sampson=6.526037
t=06 | sampson=6.519101
t=06 | sampson=6.614461
t=05 | sampson=6.867362
t=05 | sampson=6.870676
t=05 | sampson=6.865692
t=05 | sampson=6.863625
t=05 | sampson=6.866094
t=04 | sampson=6.852931
t=04 | sampson=6.856085
t=04 | sampson=6.856703
t=04 | sampson=6.849905
t=04 | sampson=6.928750
t=03 | sampson=2.407641
t=03 | sampson=2.388970
t=03 | sampson=2.366150
t=03 | sampson=2.361079
t=03 | sampson=2.315772
t=02 | sampson=0.679744
t=02 | sampson=0.677436
t=02 | sampson=0.674992
t=02 | sampson=0.670166
t=02 | sampson=0.677690
t=01 | sampson=0.681357
t=01 | sampson=0.675905
t=01 | sampson=0.676229
t=01 | sampson=0.670328
t=01 | sampson=0.680273
t=00 | sampson=0.679816
t=00 | sampson=0.674578
t=00 | sampson=0.674158
t=00 | sampson=0.669678
t=00 | sampson=0.678452
-- Pair Rot Error (Deg): 0.67
-- Pair Trans Error (Deg): 0.86
----------------------------------------------------------------------------------------------------
Category apple Done
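(Aside: the sampson values printed above are the Sampson epipolar error of the current pose samples on the extracted SuperPoint/SuperGlue matches; GGS drives this down across the denoising steps t=09..00. A minimal PyTorch sketch of the standard Sampson distance follows; it is not necessarily identical to util.geometry_guided_sampling.)

import torch

def sampson_error(F, x1, x2):
    """Sampson distance of matches under a fundamental matrix.

    F: (3, 3) fundamental matrix; x1, x2: (N, 3) homogeneous keypoints.
    This is the first-order approximation of the geometric epipolar error.
    """
    Fx1 = x1 @ F.T    # epipolar lines of x1 in image 2, shape (N, 3)
    Ftx2 = x2 @ F     # epipolar lines of x2 in image 1, shape (N, 3)
    algebraic = (x2 * Fx1).sum(dim=1)  # x2^T F x1 per match
    denom = Fx1[:, 0] ** 2 + Fx1[:, 1] ** 2 + Ftx2[:, 0] ** 2 + Ftx2[:, 1] ** 2
    return algebraic ** 2 / denom.clamp(min=1e-12)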
Hi @swjTheDad,
I visually checked the scene 189_20393_38136 and samples/apple. They are two different scenes. It is quite likely that the model shared here overfits to certain patterns and hence works quite well on specific scenes but poorly on others, especially when you train on one GPU instead of eight.
samples/apple: [screenshot]
189_20393_38136: [screenshot]
I tested your model on more scenes, and it seems to work on only a subset of them. I am not sure whether you trained it only on "189_20393_38136", but it looks quite overfitted.
I sincerely apologize for the oversight: I downloaded the dataset with the single_sequence_subset option. Thank you very much for the timely response.
I ran into some confusion while reproducing the project as per the readme.md. During training, I tested the intermediate output pytorch_model.bin with test.py and found the results to be very good. However, the decrease in Sampson error is concentrated around t=02-00 and t=09. These are the Sampson error values:

t=09 | sampson=9.013452
t=09 | sampson=9.001946
t=09 | sampson=8.975619
t=09 | sampson=8.953597
t=09 | sampson=8.897875
t=08 | sampson=8.847275
t=08 | sampson=8.837044
t=08 | sampson=8.818635
t=08 | sampson=8.816516
t=08 | sampson=7.834114
t=07 | sampson=7.766330
t=07 | sampson=7.785002
t=07 | sampson=7.734869
t=07 | sampson=7.732924
t=07 | sampson=7.691217
t=06 | sampson=6.527886
t=06 | sampson=6.533252
t=06 | sampson=6.526037
t=06 | sampson=6.519101
t=06 | sampson=6.614461
t=05 | sampson=6.867362
t=05 | sampson=6.870676
t=05 | sampson=6.865692
t=05 | sampson=6.863625
t=05 | sampson=6.866094
t=04 | sampson=6.852931
t=04 | sampson=6.856085
t=04 | sampson=6.856703
t=04 | sampson=6.849905
t=04 | sampson=6.928750
t=03 | sampson=2.407641
t=03 | sampson=2.388970
t=03 | sampson=2.366150
t=03 | sampson=2.361079
t=03 | sampson=2.315772
t=02 | sampson=0.679744
t=02 | sampson=0.677436
t=02 | sampson=0.674992
t=02 | sampson=0.670166
t=02 | sampson=0.677690
t=01 | sampson=0.681357
t=01 | sampson=0.675905
t=01 | sampson=0.676229
t=01 | sampson=0.670328
t=01 | sampson=0.680273
t=00 | sampson=0.679816
t=00 | sampson=0.674578
t=00 | sampson=0.674158
t=00 | sampson=0.669678
t=00 | sampson=0.678452
-- Pair Rot Error (Deg): 0.67
-- Pair Trans Error (Deg): 0.86

But when I tested it again with demo.py, the results were very poor, as shown by the following Sampson error values:

t=09 | sampson=9.816686
t=09 | sampson=9.817419
t=09 | sampson=9.808633
t=09 | sampson=9.807852
t=09 | sampson=9.800014
t=08 | sampson=9.599756
t=08 | sampson=9.599182
t=08 | sampson=9.598705
t=08 | sampson=9.596356
t=08 | sampson=9.584873
t=07 | sampson=9.477237
t=07 | sampson=9.476614
t=07 | sampson=9.474885
t=07 | sampson=9.452090
t=07 | sampson=9.368668
t=06 | sampson=9.295912
t=06 | sampson=9.297716
t=06 | sampson=9.277538
t=06 | sampson=9.269544
t=06 | sampson=9.193401
t=05 | sampson=9.116795
t=05 | sampson=9.115306
t=05 | sampson=9.086121
t=05 | sampson=9.043097
t=05 | sampson=8.990879
t=04 | sampson=8.676836
t=04 | sampson=8.678219
t=04 | sampson=8.596172
t=04 | sampson=8.562906
t=04 | sampson=8.441508
t=03 | sampson=7.771114
t=03 | sampson=7.769285
t=03 | sampson=7.670792
t=03 | sampson=7.694279
t=03 | sampson=7.510387
t=02 | sampson=7.150699
t=02 | sampson=7.150811
t=02 | sampson=7.095228
t=02 | sampson=7.017509
t=02 | sampson=6.951574
t=01 | sampson=6.954993
t=01 | sampson=6.955435
t=01 | sampson=6.932558
t=01 | sampson=6.922458
t=01 | sampson=6.789901
t=00 | sampson=6.529342
t=00 | sampson=6.541269
t=00 | sampson=6.386536
t=00 | sampson=6.355147
t=00 | sampson=6.179797
Time taken: 97.6004 seconds
PerspectiveCameras()
PerspectiveCameras()
1
-- Pair Rot Error (Deg): 76.52
-- Pair Trans Error (Deg): 45.81

I'm wondering if there might be an issue with the preprocessing of the CO3D dataset that caused a mix-up between the training and test sets.
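(One quick way to confirm or rule out such a mix-up is to check whether any sequence names are shared between the train and test splits. A hypothetical sketch, assuming the .jgz annotation files are gzip-compressed JSON dicts keyed by sequence name; paths and file names here are illustrative only.)

import gzip
import json

def load_sequence_names(annotation_path):
    # Assumption: a .jgz annotation file is gzip-compressed JSON keyed by sequence name.
    with gzip.open(annotation_path, "rt") as f:
        return set(json.load(f).keys())

train_seqs = load_sequence_names("co3ddown/apple/apple_train.jgz")
test_seqs = load_sequence_names("co3ddown/apple/apple_test.jgz")
overlap = train_seqs & test_seqs
print(f"{len(overlap)} sequence(s) appear in both splits:", sorted(overlap))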