Closed ShahRutav closed 3 months ago
Hello! Thanks for reaching out. Here's some code from one of our teammates @cremebrule that does very similar things:
def compute_point_cloud_from_depth(depth, K, cam_to_img_tf=None, world_to_cam_tf=None, visualize_every=0, grid_limits=None):
# K - 3x3 cam intrinsics matrix
# tfs - 4x4 homogeneous global pose tf for cam
# Camera points in -z, so rotate by 180 deg so it points correctly in +z -- this means
# omni cam_to_img_tf is T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
h, w = depth.shape
y, x = np.meshgrid(np.arange(h), np.arange(w), indexing="ij", sparse=False)
assert depth.min() >= 0
u = x
v = y
uv = np.dstack((u, v, np.ones_like(u)))
Kinv = np.linalg.inv(K)
pc = depth.reshape(-1, 1) * (uv.reshape(-1, 3) @ Kinv.T)
pc = pc.reshape(h, w, 3)
# If no tfs, use identity matrix
cam_to_img_tf = np.eye(4) if cam_to_img_tf is None else cam_to_img_tf
world_to_cam_tf = np.eye(4) if world_to_cam_tf is None else world_to_cam_tf
pc = np.concatenate([pc.reshape(-1, 3), np.ones((h * w, 1))], axis=-1) # shape (H*W, 4)
# Convert using camera transform
# Create (H * W, 4) vector from pc
pc = (pc @ cam_to_img_tf.T @ world_to_cam_tf.T)[:, :3].reshape(h, w, 3)
if visualize_every > 0:
pc_flat = np.array(pc.reshape(-1, 3))
pc_flat = np.where(np.linalg.norm(pc_flat, axis=-1, keepdims=True) > 1e4, 0.0, pc_flat)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(pc_flat[::visualize_every, 0], pc_flat[::visualize_every, 1], pc_flat[::visualize_every, 2], s=1)
if grid_limits is not None:
ax.set_xbound(lower=-grid_limits, upper=grid_limits)
ax.set_ybound(lower=-grid_limits, upper=grid_limits)
ax.set_zbound(lower=-grid_limits, upper=grid_limits)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.show()
return pc
Do you mind trying this function? You can derive K
from the camera_parameters
. Also, to help with distortion, you could try the depth_linear
modality instead of depth
. Please let me know if that helps!
Can you share how to do you calculate cam_to_img_tf
and world_to_cam_tf
? It is not directly provided in camera_parameters, it might introduce some differences.
I think you can try this:
set cam_to_img_tf
to T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
and
world_to_cam_tf
to T.pose2mat(camera.get_position_orientation())
Let me know if that works!
Hi, the code you shared has an issue similar to the one I highlighted. The point cloud points of the entire scene look reasonable but are off from the global xyz axis.
For example, the center point on the object received from grasp_obj.get_position()
[-0.5000377 -1.0000356 0.44774088] is way off compared to the back-projected value [-2.41141153 -1.6718964 -1.32171841] calculated
Here's a minimally reproducing code based on the pointers you mentioned,
import os
import time
import math
import yaml
import cv2
import random
import copy
import numpy as np
import matplotlib.pyplot as plt
import open3d as o3d
import omnigibson as og
from omnigibson.action_primitives.starter_semantic_action_primitives import StarterSemanticActionPrimitiveSet, StarterSemanticActionPrimitives
import skills.skill_utils as SU
from skills.skill_classes import StarterSkillSet, StarterSkills
import omnigibson.utils.transform_utils as T
from omnigibson.macros import gm
gm.USE_GPU_DYNAMICS = True
def get_configs():
"""
Demonstrates how to use the action primitives to pick and place an object in an empty scene.
It loads Rs_int with a Fetch robot, and the robot picks and places a bottle of cologne.
"""
# Load the config
config_filename = os.path.join(og.example_config_path, "fetch_primitives.yaml")
config = yaml.load(open(config_filename, "r"), Loader=yaml.FullLoader)
# config['robots'][0]['type'] = 'Tiago'
# Update it to create a custom environment and run some actions
config["scene"]["scene_model"] = "Rs_int"
# config["scene"]["load_object_categories"] = ["floors", "ceilings", "walls", "coffee_table"]
config["scene"]["load_object_categories"] = ["floors", "coffee_table"]
config["objects"] = [
{
"type": "DatasetObject",
"name": "cologne",
"category": "bottle_of_cologne",
"model": "lyipur",
"position": [-0.3, -0.8, 0.5],
"orientation": [0, 0, 0, 1]
},
{
"type": "DatasetObject",
"name": "table",
"category": "breakfast_table",
"model": "rjgmmy",
"scale": [0.3, 0.3, 0.3],
"position": [-0.7, 0.5, 0.2],
"orientation": [0, 0, 0, 1]
}
]
return config
def set_robot_position_orientation(pos, ori, robot, replay_steps=1):
robot.set_position_orientation(pos, ori)
for _ in range(replay_steps):
og.sim.play()
og.sim.step()
return
def world_to_image_pinhole(world_points, camera_params):
# Project corners to image space (assumes pinhole camera model)
proj_mat = camera_params["cameraProjection"].reshape(4, 4)
view_mat = camera_params["cameraViewTransform"].reshape(4, 4)
view_proj_mat = view_mat @ proj_mat
world_points_homo = np.pad(world_points, ((0, 0), (0, 1)), constant_values=1.0)
tf_points = world_points_homo @ view_proj_mat
tf_points = tf_points / (tf_points[..., -1:])
return 0.5 * (tf_points[..., :2] + 1)
def get_pixel_from_pos(pos_3d, camera_parameters, height, width):
pixels = world_to_image_pinhole(pos_3d, camera_parameters)
pixels = pixels*np.array([[width, height]])
# Flip the axis to the co-ordinate system (Maybe because obs-rgb is flipped?).
pixels[:,1] = height - pixels[:,1]
return pixels
def compute_point_cloud_from_depth(
depth,
K,
cam_to_img_tf=None,
world_to_cam_tf=None,
visualize_every=0,
grid_limits=None
):
# K - 3x3 cam intrinsics matrix
# tfs - 4x4 homogeneous global pose tf for cam
# Camera points in -z, so rotate by 180 deg so it points correctly in +z -- this means
# omni cam_to_img_tf is T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
h, w = depth.shape
y, x = np.meshgrid(np.arange(h), np.arange(w), indexing="ij", sparse=False)
assert depth.min() >= 0
u = x
v = y
uv = np.dstack((u, v, np.ones_like(u)))
Kinv = np.linalg.inv(K)
pc = depth.reshape(-1, 1) * (uv.reshape(-1, 3) @ Kinv.T)
pc = pc.reshape(h, w, 3)
# If no tfs, use identity matrix
cam_to_img_tf = np.eye(4) if cam_to_img_tf is None else cam_to_img_tf
world_to_cam_tf = np.eye(4) if world_to_cam_tf is None else world_to_cam_tf
pc = np.concatenate([pc.reshape(-1, 3), np.ones((h * w, 1))], axis=-1) # shape (H*W, 4)
# Convert using camera transform
# Create (H * W, 4) vector from pc
pc = (pc @ cam_to_img_tf.T @ world_to_cam_tf.T)[:, :3].reshape(h, w, 3)
if visualize_every > 0:
pc_flat = np.array(pc.reshape(-1, 3))
pc_flat = np.where(np.linalg.norm(pc_flat, axis=-1, keepdims=True) > 1e4, 0.0, pc_flat)
fig = plt.figure()
ax = fig.add_subplot(111, projection="3d")
ax.scatter(pc_flat[::visualize_every, 0], pc_flat[::visualize_every, 1], pc_flat[::visualize_every, 2], s=1)
if grid_limits is not None:
ax.set_xbound(lower=-grid_limits, upper=grid_limits)
ax.set_ybound(lower=-grid_limits, upper=grid_limits)
ax.set_zbound(lower=-grid_limits, upper=grid_limits)
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')
plt.show()
return pc
def get_intrinsic_matrix(cam_p):
K = np.zeros((3,3))
# Aspect ratio is 1
K[0,0] = cam_p['cameraFocalLength']
K[1,1] = cam_p['cameraFocalLength']
# cx, cy
K[0,2] = cam_p['renderProductResolution'][0] / 2
K[1,2] = cam_p['renderProductResolution'][1] / 2
K[2,2] = 1.0
return K
# Getting the environment
config = get_configs()
env = og.Environment(configs=config)
scene = env.scene
robot = env.robots[0]
grasp_obj = scene.object_registry("name", "cologne")
camera = robot.sensors['robot0:eyes:Camera:0']
# Setting the robot position to a fixed pose facing the cologne
replay_steps = 100
towards_cologne_pos = np.asarray([-0.9420595,-0.43640956,0.00098968])
towards_cologne_ori = np.asarray([-0.00805346,-0.01451508,-0.24558648,0.9692325])
set_robot_position_orientation(towards_cologne_pos, towards_cologne_ori, robot, replay_steps=replay_steps)
# Verify that the robot is facing towards the cologne
pos, ori = robot.get_position_orientation()
print(pos, ori)
obs,_ = env.get_obs()
rgb = copy.deepcopy(obs['robot0']['robot0:eyes:Camera:0']['rgb'])
depth = copy.deepcopy(obs['robot0']['robot0:eyes:Camera:0']['depth'])
# Project the ground truth 3d pose to 2d pixel
gt_px = get_pixel_from_pos(
grasp_obj.get_position().reshape(1,-1),
copy.deepcopy(camera.camera_parameters),
height=camera.camera_parameters['renderProductResolution'][0],
width=camera.camera_parameters['renderProductResolution'][1],
)
plt.imshow(rgb)
plt.scatter(gt_px[:,0], gt_px[:,1], color="r")
plt.savefig('init.png')
# Calculating the 3D pose of the cologne
cam_p = copy.deepcopy(camera.camera_parameters)
cam_to_img_tf = T.pose2mat(([0, 0, 0], T.euler2quat([np.pi, 0, 0])))
world_to_cam_tf = T.pose2mat(camera.get_position_orientation())
K = get_intrinsic_matrix(cam_p)
pcd_test = compute_point_cloud_from_depth(
depth=depth, K=K,
cam_to_img_tf=cam_to_img_tf,
world_to_cam_tf=world_to_cam_tf,
)
# Print the 2d points
print(f"Predicted 3D pose of the cologne: {pcd_test[int(gt_px[0,1]),int(gt_px[0,0])]}")
print(f"Ground truth 3D pose of the cologne: {grasp_obj.get_position()}")
Sorry for the wait!! Here's an updated version of get_intrinsic_matrix
from @cremebrule :
def get_intrinsic_matrix(cam_p):
K = np.zeros((3,3))
w, h = cam_p['renderProductResolution']
horiz_aperture = cam_p["cameraAperture"][0]
focal_length = cam_p["cameraFocalLength"]
horizontal_fov = 2 * math.atan(horiz_aperture / (2 * focal_length))
vertical_fov = horizontal_fov * h / w
f_x = (w / 2.0) / np.tan(horizontal_fov / 2.0)
f_y = (h / 2.0) / np.tan(vertical_fov / 2.0)
K = np.array([
[f_x, 0.0, w / 2.0],
[0.0, f_y, h / 2.0],
[0.0, 0.0, 1.0]
])
return K
You can also add the depth_linear
modality to your camera:
camera.add_modality("depth_linear")
We also have a PR now for updating our own intrinsic matrix implementation #692 . Let me know if that works for you!
I verified this on my end! The points are reasonably close. Thanks a lot!
Closing this issue.
Backprojecting 2d pixels to 3d pose I implemented the following version (pinhole assumption as in the [code-piece]
Attaching the visualization of the output of the 3d positions (as an html file) for depth image,
3d point vis
However, the 3d positions calculated are different than the world frame, e.g., the point
obj.get_position_orientation()
is completely different? Moreover, the plane seems to be distorted; do you know why?