NVlabs / diff-dope

Pose estimation refiner

resultant pose is incorrect #8

Open belalhmedan90 opened 5 months ago

belalhmedan90 commented 5 months ago

Hi, I tried to replicate your simple_scene example with the following settings, but the resulting pose seems to be wrong!

From right to left: rough pose, origin, predicted pose:

[image: diff_pose]

diffpose.yaml

---
camera:
    fx: 608.7362060546875
    fy: 608.2891235351562
    cx: 313.5289611816406
    cy: 241.66563415527344
    im_width: 640
    im_height: 480
scene:
    path_img: data/example/scene/rgb.png
    path_depth:  data/example/scene/depth.png
    path_segmentation:  data/example/scene/seg.png
    image_resize: 1.0
object3d:
    position: [-4.763, 186.02273, 880.0] # mm
    scale: 0.01 # ??
    # rotation can be a quaternion, or a 3x3 matrix (flattened or not)
    rotation: [0.76681062, 0.57029835, 0.29455266, 0.49704141, -0.81793831, 0.28970114, 0.40614199, -0.07574104, -0.91066568]
    model_path: data/example/mesh/cube_mesh.ply
losses:
    l1_rgb_with_mask: false
    weight_rgb: 0.7
    l1_depth_with_mask: false
    weight_depth: 1
    l1_mask: true
    weight_mask: 1
hyperparameters:
    nb_iterations: 60
    batchsize: 2
    base_lr: 20
    learning_rates_bound: [0.01, 100]
    learning_rate_base: 1
    lr_decay: 0.01
render_images:
    nrow: 4
    final_width_batch: 2000
    add_background: true
    alpha_overlay: 0.7
    add_countour: true
    color_countour: [0.46, 0.73, 0]
    flip_result: true
    crop_around_mask: true
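
As an aside, the flattened rotation above can be sanity-checked for orthonormality and converted to a quaternion; a minimal sketch (assuming scipy is available and the nine numbers are read in row-major order, as in the script further down):

import numpy as np
from scipy.spatial.transform import Rotation

# Row-major reshape of the nine numbers from diffpose.yaml.
R = np.array([
    0.76681062,  0.57029835,  0.29455266,
    0.49704141, -0.81793831,  0.28970114,
    0.40614199, -0.07574104, -0.91066568,
]).reshape(3, 3)

# A proper rotation is orthonormal with determinant +1.
print("orthonormal:", np.allclose(R @ R.T, np.eye(3), atol=1e-6))
print("det(R):", np.linalg.det(R))

# Equivalent quaternion, (x, y, z, w) in scipy's convention.
print("quaternion:", Rotation.from_matrix(R).as_quat())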

Here are my mask, depth, and RGB images:

Segmentation mask

[image: seg]

Depth image

[image: depth]

RGB image

[image: rgb]

And here is the cube_mesh.ply, which is a simple cuboid of dimensions 0.285 x 0.24 x 0.165 meters. I also tried an up-sampled version of the mesh cloud, with no success!

cube_mesh.ply

ply
format ascii 1.0
element vertex 8
property float x
property float y
property float z
property float nx
property float ny
property float nz
property uchar red
property uchar green
property uchar blue
element face 12
property list uchar int vertex_indices
end_header
-142.5 -120.0 -82.5 -0.43058563037375547 -0.5113204360688346 -0.7437388161001229 0 0 0 
142.5 -120.0 -82.5 0.43058563037375547 -0.5113204360688346 -0.7437388161001229 0 0 0 
-142.5 -120.0 82.5 -0.43058563037375547 -0.5113204360688346 0.7437388161001229 0 0 0 
142.5 -120.0 82.5 0.43058563037375547 -0.5113204360688346 0.7437388161001229 0 0 0 
-142.5 120.0 -82.5 -0.43058563037375547 0.5113204360688346 -0.7437388161001229 0 0 0 
142.5 120.0 -82.5 0.43058563037375547 0.5113204360688346 -0.7437388161001229 0 0 0 
-142.5 120.0 82.5 -0.43058563037375547 0.5113204360688346 0.7437388161001229 0 0 0 
142.5 120.0 82.5 0.43058563037375547 0.5113204360688346 0.7437388161001229 0 0 0 
3 4 7 5
3 4 6 7
3 0 2 4
3 2 6 4
3 0 1 2
3 1 3 2
3 1 5 7
3 1 7 3
3 2 3 7
3 2 7 6
3 0 4 1
3 1 4 5
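
Side note: the PLY vertex coordinates above are in millimeters (extents 285 x 240 x 165), while the cuboid dimensions are stated in meters. A minimal sketch, assuming Open3D, to see what a given scale does to the mesh extents:

import open3d as o3d

# Load the cuboid and report its axis-aligned extents under each candidate scale.
mesh = o3d.io.read_triangle_mesh("data/example/mesh/cube_mesh.ply")
extent = mesh.get_axis_aligned_bounding_box().get_extent()
print("raw extents:", extent)  # ~ [285, 240, 165], i.e. millimeters

for scale in (0.01, 0.001):
    print(scale, "->", extent * scale)

A scale of 0.001 reproduces the stated 0.285 x 0.24 x 0.165 m (and matches the 0.001 used in the visualization script below); whether the 0.01 in diffpose.yaml is right is exactly the "# ??" question above.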

I use this script to visualize the results (I divided the translation by 10 to make the refined pose error visible):

import cv2
import numpy as np
import open3d as o3d

def create_frame(
    translation: np.ndarray = np.zeros((3, 1), dtype=np.float32),
    orientation: np.ndarray = np.eye(3, dtype=np.float32),
    frame_size: float = 0.1,
) -> o3d.geometry.TriangleMesh:
    frame_pose = o3d.geometry.TriangleMesh.create_coordinate_frame(
        size=frame_size,
        origin=translation,
    )
    # Rotate about the frame's own origin; by default Open3D rotates
    # about the geometry's center, which would offset the axes.
    frame_pose.rotate(orientation, center=translation)
    return frame_pose

if __name__ == "__main__":
    rough_pose = np.array([
        [0.76681062,  0.57029835,  0.29455266, -0.004763   ],
        [0.49704141, -0.81793831,  0.28970114,  0.18602273 ],
        [0.40614199, -0.07574104, -0.91066568,  0.88       ],
        [0.        ,  0.        ,  0.        ,  1.         ]
    ])
    #
    refined_pose = np.array([
        [ 0.826493  ,  0.45631468,  0.3296759 , -0.33387086],
        [-0.56285906,  0.6594877 ,  0.49826264, -1.5749068 ],
        [ 0.00994734, -0.5973717 ,  0.80190283, -8.098417  ],
        [ 0.        ,  0.        ,  0.        ,  1.        ],
    ])

    # refined_pose[:3, 3] /= 10

    origin = create_frame()
    #
    pose_rough = create_frame(
        translation=rough_pose[:3, 3],
        orientation=rough_pose[:3, :3]
    )
    #
    pose_refined = create_frame(
        translation=refined_pose[:3, 3],
        orientation=refined_pose[:3, :3]
    )
    #
    # Load the cuboid mesh and scale it from millimeters to meters.
    pcl_io = o3d.io.read_point_cloud("/data/example/mesh/cube_mesh.ply")
    pcl_io.scale(0.001, [0.0, 0.0, 0.0])
    #
    # Read the 16-bit depth map and the mask, keeping depth only inside the mask.
    depth = cv2.imread("/data/example/scene/depth.png", cv2.IMREAD_UNCHANGED)
    mask = cv2.imread("/data/example/scene/seg.png", cv2.IMREAD_GRAYSCALE)
    masked_depth = cv2.bitwise_and(depth, depth, mask=mask)

    depth_3d = o3d.geometry.Image(
        np.ascontiguousarray(masked_depth)
    )
    intrinsics = o3d.camera.PinholeCameraIntrinsic(
        640,                 # width
        480,                 # height
        608.7362060546875,   # fx
        608.2891235351562,   # fy
        313.5289611816406,   # cx
        241.66563415527344,  # cy
    )
    # Back-project the masked depth; Open3D's default depth_scale of 1000
    # assumes millimeter depth and produces points in meters.
    box_pcl = o3d.geometry.PointCloud.create_from_depth_image(
        depth_3d, intrinsics
    )
    box_pcl.estimate_normals()
    #
    o3d.visualization.draw_geometries([origin, pose_rough, pose_refined, pcl_io, box_pcl])
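
To make the comparison more direct, the mesh itself can be drawn under each pose instead of only the coordinate frames; an illustrative extension, reusing the variables from the script above:

import copy

mesh_rough = copy.deepcopy(pcl_io)
mesh_rough.transform(rough_pose)      # cuboid at the rough pose
mesh_refined = copy.deepcopy(pcl_io)
mesh_refined.transform(refined_pose)  # cuboid at the refined pose
o3d.visualization.draw_geometries([origin, mesh_rough, mesh_refined, box_pcl])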
TontonTremblay commented 2 months ago

I feel like the depth image is not working very well. Maybe there is a problem with the scale? The code base has not been tested extensively, so we are going to have to debug things slowly. What does the loss look like? Are you getting "inf" in the loss?
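
On the scale question, a minimal way to probe the depth units is to print statistics of the masked depth; an illustrative sketch, assuming the 16-bit PNG encodes millimeters and using the paths from the config above:

import cv2
import numpy as np

# Read depth unchanged (preserving 16-bit values) and the segmentation mask.
depth = cv2.imread("data/example/scene/depth.png", cv2.IMREAD_UNCHANGED)
mask = cv2.imread("data/example/scene/seg.png", cv2.IMREAD_GRAYSCALE)

vals = depth[mask > 0].astype(np.float32)
print("dtype:", depth.dtype)  # uint16 is typical for millimeter depth
print("min/median/max:", vals.min(), np.median(vals), vals.max())
print("median in meters (if mm):", np.median(vals) / 1000.0)
# The config places the object at z = 880 mm, so a median near 880
# would be consistent with millimeter units.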