StanfordVL / OmniGibson

OmniGibson: a platform for accelerating Embodied AI research built upon NVIDIA's Omniverse engine. Join our Discord for support: https://discord.gg/bccR5vGFEx
https://behavior.stanford.edu/omnigibson/
MIT License

Backprojecting 2D pixel to 3D coordinates w.r.t world frame #689

Closed ShahRutav closed 3 months ago

ShahRutav commented 3 months ago

Backprojecting 2D pixels to 3D pose: I implemented the following version (pinhole assumption, as in the [code-piece]):

import numpy as np

def convert_pixel_to_pos(pixels, depth, camera_parameters, height, width):
    '''
    Args
        pixels: (n,2) numpy array with (x,y)
        depth: (h,w) depth map
        camera_parameters: camera.camera_parameters from omnigibson camera model
        height, width: depth.shape[:2]
    Return:
        3D positions w.r.t. the world coordinate frame
    '''
    # depth_samples = bilinear_interpolate(depth, x=pixels[:,0], y=pixels[:,1]).reshape(-1, 1)
    depth_samples = np.array([depth[int(p[1]), int(p[0])] for p in pixels]).reshape(-1, 1)
    # Convert pixels to normalized device coordinates
    pixels[:, 1] = height - pixels[:, 1]  # Flip y-axis
    pixels_ndc = 2.0 * pixels / np.array([[width, height]]) - 1  # Scale to [-1, 1] range
    # Value here should be:  1.4276962e-02 -5.1863410e-01
    # Scale the NDC by the depth (assuming the camera looks along the -z axis in NDC)
    pixels_depth_scaled = np.concatenate((pixels_ndc * depth_samples, depth_samples, np.ones_like(depth_samples)), axis=1)
    # Apply the inverse transformations manually
    view_proj =  camera_parameters["cameraViewTransform"].reshape(4,4) @ camera_parameters["cameraProjection"].reshape(4, 4)
    view_proj_inv = np.linalg.inv(view_proj)
    world_coords = pixels_depth_scaled @ view_proj_inv
    world_coords = world_coords[:, :3] / world_coords[:, 3:]
    return world_coords[:, :3]

Attaching the visualization of the output 3D positions (as an HTML file): depth image, depth 3d point vis.

However, the calculated 3D positions don't line up with the world frame; e.g., they are completely different from the value returned by obj.get_position_orientation(). Moreover, the plane seems distorted; do you know why?

hang-yin commented 3 months ago

Hello! Thanks for reaching out. Here's some code from one of our teammates @cremebrule that does very similar things:

import numpy as np
import matplotlib.pyplot as plt

def compute_point_cloud_from_depth(depth, K, cam_to_img_tf=None, world_to_cam_tf=None, visualize_every=0, grid_limits=None):
    # K - 3x3 cam intrinsics matrix
    # tfs - 4x4 homogeneous global pose tf for cam
    # Camera points in -z, so rotate by 180 deg so it points correctly in +z -- this means
    # omni cam_to_img_tf is T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
    h, w = depth.shape
    y, x = np.meshgrid(np.arange(h), np.arange(w), indexing="ij", sparse=False)
    assert depth.min() >= 0
    u = x
    v = y
    uv = np.dstack((u, v, np.ones_like(u)))

    Kinv = np.linalg.inv(K)

    pc = depth.reshape(-1, 1) * (uv.reshape(-1, 3) @ Kinv.T)
    pc = pc.reshape(h, w, 3)

    # If no tfs, use identity matrix
    cam_to_img_tf = np.eye(4) if cam_to_img_tf is None else cam_to_img_tf
    world_to_cam_tf = np.eye(4) if world_to_cam_tf is None else world_to_cam_tf

    pc = np.concatenate([pc.reshape(-1, 3), np.ones((h * w, 1))], axis=-1)  # shape (H*W, 4)

    # Convert using camera transform
    # Create (H * W, 4) vector from pc
    pc = (pc @ cam_to_img_tf.T @ world_to_cam_tf.T)[:, :3].reshape(h, w, 3)

    if visualize_every > 0:
        pc_flat = np.array(pc.reshape(-1, 3))
        pc_flat = np.where(np.linalg.norm(pc_flat, axis=-1, keepdims=True) > 1e4, 0.0, pc_flat)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(pc_flat[::visualize_every, 0], pc_flat[::visualize_every, 1], pc_flat[::visualize_every, 2], s=1)
        if grid_limits is not None:
            ax.set_xbound(lower=-grid_limits, upper=grid_limits)
            ax.set_ybound(lower=-grid_limits, upper=grid_limits)
            ax.set_zbound(lower=-grid_limits, upper=grid_limits)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        plt.show()

    return pc

Do you mind trying this function? You can derive K from the camera_parameters. Also, to help with distortion, you could try the depth_linear modality instead of depth. Please let me know if that helps!

ShahRutav commented 3 months ago

Can you share how you calculate cam_to_img_tf and world_to_cam_tf? They aren't directly provided in camera_parameters, so deriving them myself might introduce some differences.

hang-yin commented 3 months ago

I think you can try this: set cam_to_img_tf to T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0]))) and world_to_cam_tf to T.pose2mat(camera.get_position_orientation()), as in the sketch below. Let me know if that works!
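
In code, that's roughly the following (a minimal sketch, assuming camera is the robot's eye VisionSensor and T is omnigibson.utils.transform_utils, as elsewhere in this thread):

import numpy as np
import omnigibson.utils.transform_utils as T

# The Omniverse camera looks along -z, so rotate 180 degrees about x to get a +z-looking image frame
cam_to_img_tf = T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
# Pose of the camera in the world frame
world_to_cam_tf = T.pose2mat(camera.get_position_orientation())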

ShahRutav commented 3 months ago

Hi, the code you shared has an issue similar to the one I highlighted. The point cloud of the entire scene looks reasonable but is offset from the global xyz axes. For example, the object's center from grasp_obj.get_position(), [-0.5000377 -1.0000356 0.44774088], is way off compared to the back-projected value [-2.41141153 -1.6718964 -1.32171841].

ShahRutav commented 3 months ago

Here's a minimal reproduction script based on the pointers you mentioned:

import os
import time
import math
import yaml
import cv2
import random
import copy
import numpy as np
import matplotlib.pyplot as plt
import open3d as o3d

import omnigibson as og
from omnigibson.action_primitives.starter_semantic_action_primitives import StarterSemanticActionPrimitiveSet, StarterSemanticActionPrimitives

import omnigibson.utils.transform_utils as T

from omnigibson.macros import gm
gm.USE_GPU_DYNAMICS = True

def get_configs():
    """
    Demonstrates how to use the action primitives to pick and place an object in an empty scene.

    It loads Rs_int with a Fetch robot, and the robot picks and places a bottle of cologne.
    """
    # Load the config
    config_filename = os.path.join(og.example_config_path, "fetch_primitives.yaml")
    config = yaml.load(open(config_filename, "r"), Loader=yaml.FullLoader)
    # config['robots'][0]['type'] = 'Tiago'

    # Update it to create a custom environment and run some actions
    config["scene"]["scene_model"] = "Rs_int"
    # config["scene"]["load_object_categories"] = ["floors", "ceilings", "walls", "coffee_table"]
    config["scene"]["load_object_categories"] = ["floors", "coffee_table"]

    config["objects"] = [
        {
            "type": "DatasetObject",
            "name": "cologne",
            "category": "bottle_of_cologne",
            "model": "lyipur",
            "position": [-0.3, -0.8, 0.5],
            "orientation": [0, 0, 0, 1]
        },
        {
            "type": "DatasetObject",
            "name": "table",
            "category": "breakfast_table",
            "model": "rjgmmy",
            "scale": [0.3, 0.3, 0.3],
            "position": [-0.7, 0.5, 0.2],
            "orientation": [0, 0, 0, 1]
        }
    ]
    return config

def set_robot_position_orientation(pos, ori, robot, replay_steps=1):
    robot.set_position_orientation(pos, ori)
    for _ in range(replay_steps):
        og.sim.play()
        og.sim.step()
    return

def world_to_image_pinhole(world_points, camera_params):
    # Project corners to image space (assumes pinhole camera model)
    proj_mat = camera_params["cameraProjection"].reshape(4, 4)
    view_mat = camera_params["cameraViewTransform"].reshape(4, 4)
    view_proj_mat = view_mat @ proj_mat
    world_points_homo = np.pad(world_points, ((0, 0), (0, 1)), constant_values=1.0)
    tf_points = world_points_homo @ view_proj_mat
    tf_points = tf_points / (tf_points[..., -1:])
    return 0.5 * (tf_points[..., :2] + 1)

def get_pixel_from_pos(pos_3d, camera_parameters, height, width):
    pixels = world_to_image_pinhole(pos_3d, camera_parameters)
    pixels = pixels*np.array([[width, height]])
    # Flip the y-axis to match the image coordinate system (maybe because the rgb observation is flipped?).
    pixels[:,1] = height - pixels[:,1]
    return pixels

def compute_point_cloud_from_depth(
        depth,
        K,
        cam_to_img_tf=None,
        world_to_cam_tf=None,
        visualize_every=0,
        grid_limits=None
    ):
    # K - 3x3 cam intrinsics matrix
    # tfs - 4x4 homogeneous global pose tf for cam
    # Camera points in -z, so rotate by 180 deg so it points correctly in +z -- this means
    # omni cam_to_img_tf is T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
    h, w = depth.shape
    y, x = np.meshgrid(np.arange(h), np.arange(w), indexing="ij", sparse=False)
    assert depth.min() >= 0
    u = x
    v = y
    uv = np.dstack((u, v, np.ones_like(u)))

    Kinv = np.linalg.inv(K)

    pc = depth.reshape(-1, 1) * (uv.reshape(-1, 3) @ Kinv.T)
    pc = pc.reshape(h, w, 3)

    # If no tfs, use identity matrix
    cam_to_img_tf = np.eye(4) if cam_to_img_tf is None else cam_to_img_tf
    world_to_cam_tf = np.eye(4) if world_to_cam_tf is None else world_to_cam_tf

    pc = np.concatenate([pc.reshape(-1, 3), np.ones((h * w, 1))], axis=-1)  # shape (H*W, 4)

    # Convert using camera transform
    # Create (H * W, 4) vector from pc
    pc = (pc @ cam_to_img_tf.T @ world_to_cam_tf.T)[:, :3].reshape(h, w, 3)

    if visualize_every > 0:
        pc_flat = np.array(pc.reshape(-1, 3))
        pc_flat = np.where(np.linalg.norm(pc_flat, axis=-1, keepdims=True) > 1e4, 0.0, pc_flat)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection="3d")
        ax.scatter(pc_flat[::visualize_every, 0], pc_flat[::visualize_every, 1], pc_flat[::visualize_every, 2], s=1)
        if grid_limits is not None:
            ax.set_xbound(lower=-grid_limits, upper=grid_limits)
            ax.set_ybound(lower=-grid_limits, upper=grid_limits)
            ax.set_zbound(lower=-grid_limits, upper=grid_limits)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')
        ax.set_zlabel('Z')
        plt.show()

    return pc

def get_intrinsic_matrix(cam_p):
    K = np.zeros((3,3))
    # Aspect ratio is 1
    K[0,0] = cam_p['cameraFocalLength']
    K[1,1] = cam_p['cameraFocalLength']
    # cx, cy
    K[0,2] = cam_p['renderProductResolution'][0] / 2
    K[1,2] = cam_p['renderProductResolution'][1] / 2
    K[2,2] = 1.0
    return K

# Getting the environment
config = get_configs()
env = og.Environment(configs=config)

scene = env.scene
robot = env.robots[0]
grasp_obj = scene.object_registry("name", "cologne")
camera = robot.sensors['robot0:eyes:Camera:0']

# Setting the robot position to a fixed pose facing the cologne
replay_steps = 100
towards_cologne_pos = np.asarray([-0.9420595,-0.43640956,0.00098968])
towards_cologne_ori = np.asarray([-0.00805346,-0.01451508,-0.24558648,0.9692325])
set_robot_position_orientation(towards_cologne_pos, towards_cologne_ori, robot, replay_steps=replay_steps)

# Verify that the robot is facing towards the cologne
pos, ori = robot.get_position_orientation()
print(pos, ori)
obs,_ = env.get_obs()
rgb = copy.deepcopy(obs['robot0']['robot0:eyes:Camera:0']['rgb'])
depth = copy.deepcopy(obs['robot0']['robot0:eyes:Camera:0']['depth'])
# Project the ground truth 3d pose to 2d pixel
gt_px = get_pixel_from_pos(
    grasp_obj.get_position().reshape(1,-1),
    copy.deepcopy(camera.camera_parameters),
    height=camera.camera_parameters['renderProductResolution'][0],
    width=camera.camera_parameters['renderProductResolution'][1],
)

plt.imshow(rgb)
plt.scatter(gt_px[:,0], gt_px[:,1], color="r")
plt.savefig('init.png')

# Calculating the 3D pose of the cologne
cam_p = copy.deepcopy(camera.camera_parameters)
cam_to_img_tf = T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
world_to_cam_tf = T.pose2mat(camera.get_position_orientation())
K = get_intrinsic_matrix(cam_p)
pcd_test = compute_point_cloud_from_depth(
    depth=depth, K=K,
    cam_to_img_tf=cam_to_img_tf,
    world_to_cam_tf=world_to_cam_tf,
)
# Compare the back-projected 3D point with the ground-truth position
print(f"Predicted 3D pose of the cologne: {pcd_test[int(gt_px[0,1]),int(gt_px[0,0])]}")
print(f"Ground truth 3D pose of the cologne: {grasp_obj.get_position()}")

hang-yin commented 3 months ago

Sorry for the wait!! Here's an updated version of get_intrinsic_matrix from @cremebrule:

import math
import numpy as np

def get_intrinsic_matrix(cam_p):
    w, h = cam_p['renderProductResolution']
    horiz_aperture = cam_p["cameraAperture"][0]
    focal_length = cam_p["cameraFocalLength"]
    horizontal_fov = 2 * math.atan(horiz_aperture / (2 * focal_length))
    vertical_fov = horizontal_fov * h / w
    f_x = (w / 2.0) / np.tan(horizontal_fov / 2.0)
    f_y = (h / 2.0) / np.tan(vertical_fov / 2.0)
    K = np.array([
        [f_x, 0.0, w / 2.0],
        [0.0, f_y, h / 2.0],
        [0.0, 0.0, 1.0]
    ])

    return K

You can also add the depth_linear modality to your camera:

camera.add_modality("depth_linear")

We also now have a PR (#692) for updating our own intrinsic matrix implementation. Let me know if that works for you!
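
For reference, here's a rough sketch of how the pieces could fit together with your reproduction script (assuming the new modality appears in the observation dict under the key depth_linear; that key name is an assumption, not something confirmed above):

# Sketch only: reuses env, camera, grasp_obj, gt_px, cam_to_img_tf, world_to_cam_tf,
# the updated get_intrinsic_matrix, and compute_point_cloud_from_depth from this thread.
camera.add_modality("depth_linear")
og.sim.step()
obs, _ = env.get_obs()
depth_linear = obs['robot0']['robot0:eyes:Camera:0']['depth_linear']  # assumed obs key for the new modality

K = get_intrinsic_matrix(camera.camera_parameters)  # aperture-based intrinsics
pcd = compute_point_cloud_from_depth(
    depth=depth_linear, K=K,
    cam_to_img_tf=cam_to_img_tf,
    world_to_cam_tf=world_to_cam_tf,
)
print("Back-projected 3D point:", pcd[int(gt_px[0, 1]), int(gt_px[0, 0])])
print("Ground-truth position:  ", grasp_obj.get_position())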

ShahRutav commented 3 months ago

I verified this on my end! The points are reasonably close. Thanks a lot!

Closing this issue.