NVlabs / FoundationPose

[CVPR 2024 Highlight] FoundationPose: Unified 6D Pose Estimation and Tracking of Novel Objects
https://nvlabs.github.io/FoundationPose/

Real-Time pose estimation on Realsense D435i #163

Closed Kaivalya192 closed 5 months ago

Kaivalya192 commented 5 months ago

I want to apply the FoundationPose model to real-time input and output. This is the modified code:

from estimater import *
from datareader import *
import pyrealsense2 as rs
from mask import *

if __name__ == '__main__':
    create_mask()

    pipeline = rs.pipeline()
    config = rs.config()
    config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
    config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
    profile = pipeline.start(config)

    depth_sensor = profile.get_device().first_depth_sensor()
    depth_scale = depth_sensor.get_depth_scale()

    clipping_distance_in_meters = 1  # 1 meter
    clipping_distance = clipping_distance_in_meters / depth_scale

    align_to = rs.stream.color
    align = rs.align(align_to)

    cam_K = np.array([[615.3791504, 0, 314.7796326], [0, 615.3792114, 236.6492004], [0, 0, 1]])

    parser = argparse.ArgumentParser()
    code_dir = os.path.dirname(os.path.realpath(__file__))
    parser.add_argument('--mesh_file', type=str, default=f'{code_dir}/../mesh/textured_mesh.obj')
    parser.add_argument('--test_scene_dir', type=str, default=f'{code_dir}/./input')
    parser.add_argument('--est_refine_iter', type=int, default=5)
    parser.add_argument('--track_refine_iter', type=int, default=2)
    parser.add_argument('--debug', type=int, default=1)
    parser.add_argument('--debug_dir', type=str, default=f'{code_dir}/debug')
    args = parser.parse_args()

    set_logging_format()
    set_seed(0)

    mesh = trimesh.load(args.mesh_file)

    debug = args.debug
    debug_dir = args.debug_dir
    os.system(f'rm -rf {debug_dir}/* && mkdir -p {debug_dir}/track_vis {debug_dir}/ob_in_cam')

    to_origin, extents = trimesh.bounds.oriented_bounds(mesh)
    bbox = np.stack([-extents / 2, extents / 2], axis=0).reshape(2, 3)

    scorer = ScorePredictor()
    refiner = PoseRefinePredictor()
    glctx = dr.RasterizeCudaContext()
    est = FoundationPose(model_pts=mesh.vertices, model_normals=mesh.vertex_normals, mesh=mesh, scorer=scorer, refiner=refiner, debug_dir=debug_dir, debug=debug, glctx=glctx)
    logging.info("estimator initialization done")

    for i in range(1000):

        if cv2.waitKey(1) == 13:
            break

        frames = pipeline.wait_for_frames()
        aligned_frames = align.process(frames)
        aligned_depth_frame = aligned_frames.get_depth_frame()
        color_frame = aligned_frames.get_color_frame()

        if not aligned_depth_frame or not color_frame:
            continue

        depth = np.asanyarray(aligned_depth_frame.get_data()).astype(np.float32)
        color = np.asanyarray(color_frame.get_data())

        logging.info(f'i:{i}')
        if i == 0:
            mask = cv2.imread('./mask.png')
            if len(mask.shape) == 3:
                for c in range(3):
                    if mask[..., c].sum() > 0:
                        mask = mask[..., c]
                        break
            mask = cv2.resize(mask, (640, 480), interpolation=cv2.INTER_NEAREST)
            pose = est.register(K=cam_K, rgb=color, depth=depth, ob_mask=mask, iteration=args.est_refine_iter)

            if debug >= 3:
                m = mesh.copy()
                m.apply_transform(pose)
                m.export(f'{debug_dir}/model_tf.obj')
                xyz_map = depth2xyzmap(depth, cam_K)
                valid = depth >= 0.1
                pcd = toOpen3dCloud(xyz_map[valid], color[valid])
                o3d.io.write_point_cloud(f'{debug_dir}/scene_complete.ply', pcd)
        else:
            pose = est.track_one(rgb=color, depth=depth, K=cam_K, iteration=args.track_refine_iter)

        os.makedirs(f'{debug_dir}/ob_in_cam', exist_ok=True)
        np.savetxt(f'{debug_dir}/ob_in_cam/{i}.txt', pose.reshape(4, 4))

        if debug >= 1:
            center_pose = pose @ np.linalg.inv(to_origin)
            vis = draw_posed_3d_box(cam_K, img=color, ob_in_cam=center_pose, bbox=bbox)
            vis = draw_xyz_axis(color, ob_in_cam=center_pose, scale=0.1, K=cam_K, thickness=3, transparency=0, is_input_rgb=True)
            cv2.imshow('1', vis[..., ::-1])
            cv2.waitKey(1)

        if debug >= 2:
            os.makedirs(f'{debug_dir}/track_vis', exist_ok=True)
            imageio.imwrite(f'{debug_dir}/track_vis/{i}.png', vis)

    pipeline.stop()
    cv2.destroyAllWindows()

In this code I am facing an issue with registering the first frame: pose = est.register(K=cam_K, rgb=color, depth=depth, ob_mask=mask, iteration=args.est_refine_iter)

I think the masking logic is working. This is the code for depth and RGB alignment:

# First import the library
import pyrealsense2 as rs
# Import Numpy for easy array manipulation
import numpy as np
# Import OpenCV for easy image rendering
import cv2

# Create a pipeline
pipeline = rs.pipeline()

# Create a config and configure the pipeline to stream
#  different resolutions of color and depth streams
config = rs.config()

# Get device product line for setting a supporting resolution
pipeline_wrapper = rs.pipeline_wrapper(pipeline)
pipeline_profile = config.resolve(pipeline_wrapper)
device = pipeline_profile.get_device()
device_product_line = str(device.get_info(rs.camera_info.product_line))

found_rgb = False
for s in device.sensors:
    if s.get_info(rs.camera_info.name) == 'RGB Camera':
        found_rgb = True
        break
if not found_rgb:
    print("The demo requires Depth camera with Color sensor")
    exit(0)

config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

# Start streaming
profile = pipeline.start(config)

# Getting the depth sensor's depth scale (see rs-align example for explanation)
depth_sensor = profile.get_device().first_depth_sensor()
depth_scale = depth_sensor.get_depth_scale()
print("Depth Scale is: " , depth_scale)

# We will be removing the background of objects more than
#  clipping_distance_in_meters meters away
clipping_distance_in_meters = 1 #1 meter
clipping_distance = clipping_distance_in_meters / depth_scale

# Create an align object
# rs.align allows us to perform alignment of depth frames to others frames
# The "align_to" is the stream type to which we plan to align depth frames.
align_to = rs.stream.color
align = rs.align(align_to)

# Streaming loop
try:
    while True:
        # Get frameset of color and depth
        frames = pipeline.wait_for_frames()
        # frames.get_depth_frame() is a 640x360 depth image

        # Align the depth frame to color frame
        aligned_frames = align.process(frames)

        # Get aligned frames
        aligned_depth_frame = aligned_frames.get_depth_frame() # aligned_depth_frame is a 640x480 depth image
        color_frame = aligned_frames.get_color_frame()

        # Validate that both frames are valid
        if not aligned_depth_frame or not color_frame:
            continue

        depth_image = np.asanyarray(aligned_depth_frame.get_data())
        color_image = np.asanyarray(color_frame.get_data())

        # Remove background - Set pixels further than clipping_distance to grey
        grey_color = 153
        depth_image_3d = np.dstack((depth_image,depth_image,depth_image)) #depth image is 1 channel, color is 3 channels
        bg_removed = np.where((depth_image_3d > clipping_distance) | (depth_image_3d <= 0), grey_color, color_image)

        # Render images:
        #   depth align to color on left
        #   depth on right
        depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)
        images = np.hstack((bg_removed, depth_colormap))

        cv2.namedWindow('Align Example', cv2.WINDOW_NORMAL)
        cv2.imshow('Align Example', images)
        key = cv2.waitKey(1)
        # Press esc or 'q' to close the image window
        if key & 0xFF == ord('q') or key == 27:
            cv2.destroyAllWindows()
            break
finally:
    pipeline.stop()

I am getting an error like this:

[__init__()] self.h5_file:
[__init__()] Using pretrained model from /home/iitgn-robotics-1/Live_Pose/FoundationPose/learning/training/../../weights/2023-10-28-18-33-37/model_best.pth
[__init__()] init done
[reset_object()] self.diameter:0.11680047637416553, vox_size:0.005840023818708276
[reset_object()] self.pts:torch.Size([1591, 3])
[reset_object()] reset done
[make_rotation_grid()] cam_in_obs:(42, 4, 4)
[make_rotation_grid()] rot_grid:(252, 4, 4)
num original candidates = 252
num of pose after clustering: 252
[make_rotation_grid()] after cluster, rot_grid:(252, 4, 4)
[make_rotation_grid()] self.rot_grid: torch.Size([252, 4, 4])
[<module>()] estimator initialization done
[<module>()] i:0
[register()] Welcome
Module Utils load on device 'cuda:0' took 3.29 ms
[register()] valid too small, return
[guess_translation()] valid is empty
[<module>()] i:1
[track_one()] Please init pose by register first
Traceback (most recent call last):
  File "kaivalya.py", line 91, in <module>
    pose = est.track_one(rgb=color, depth=depth, K=cam_K, iteration=args.track_refine_iter)
  File "/home/iitgn-robotics-1/Live_Pose/FoundationPose/estimater.py", line 253, in track_one
    raise RuntimeError
wenbowen123 commented 5 months ago

For the first frame you need to first run register to initialize the pose, see https://github.com/NVlabs/FoundationPose/blob/main/run_demo.py#L52
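For reference, a minimal sketch of that register-then-track pattern (reusing est, cam_K, mask and args from the code above; get_aligned_rgbd() is a hypothetical helper for grabbing one aligned RealSense frame):

pose = None
for i in range(1000):
    # placeholder: aligned color (H, W, 3) and depth in meters (H, W)
    color, depth = get_aligned_rgbd()
    if i == 0:
        # one-time registration on the first frame using the object mask
        pose = est.register(K=cam_K, rgb=color, depth=depth, ob_mask=mask,
                            iteration=args.est_refine_iter)
    else:
        # subsequent frames only refine the previous pose
        pose = est.track_one(rgb=color, depth=depth, K=cam_K,
                             iteration=args.track_refine_iter)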

Kaivalya192 commented 5 months ago

Traceback (most recent call last):
  File "kaivalya.py", line 114, in <module>
    pose = est.register(K=cam_K, rgb=color, depth=depth, ob_mask=mask, iteration=args.est_refine_iter)
  File "/home/iitgn-robotics-1/Live_Pose/FoundationPose/estimater.py", line 179, in register
    pcd = toOpen3dCloud(xyz_map[valid], rgb[valid])
  File "/home/iitgn-robotics-1/Live_Pose/FoundationPose/Utils.py", line 284, in toOpen3dCloud
    if colors.max()>1:
  File "/opt/conda/envs/my/lib/python3.8/site-packages/numpy/core/_methods.py", line 41, in _amax
    return umr_maximum(a, axis, None, out, keepdims, initial, where)
ValueError: zero-size array to reduction operation maximum which has no identity

I am getting this error when the first frame is being registered.

When I record a dataset with the same depth and RGB format, the code you provided works on it.

Kaivalya192 commented 5 months ago

Resolved by dividing the depth by 1000; the depth was in mm.

depth_image = np.asanyarray(aligned_depth_frame.get_data())/1e3
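For reference, a minimal sketch that uses the depth scale reported by the camera instead of a hard-coded 1e3 (variable names follow the code above):

# Query the device's depth scale once after starting the pipeline
# (typically 0.001 for the D4xx series, i.e. raw units are millimeters).
depth_sensor = profile.get_device().first_depth_sensor()
depth_scale = depth_sensor.get_depth_scale()

# Per frame: convert the raw uint16 depth to float32 meters,
# which is what est.register() and est.track_one() expect.
depth = np.asanyarray(aligned_depth_frame.get_data()).astype(np.float32) * depth_scale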

SeungHunJeon commented 4 months ago

How did you create the mask while using realsense?

Kaivalya192 commented 4 months ago

How did you create the mask while using realsense?

I used XMem segmentation. Live Pose is my repo where I have implemented everything, and it works live with a RealSense camera.
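For anyone without an XMem pipeline set up, a rough alternative (only a sketch, not what the repo above uses) is to draw a box around the object on the first color frame with OpenCV and use that rectangle as the binary mask:

import cv2
import numpy as np

# Let the user drag a rectangle around the object on the first color frame.
x, y, w, h = cv2.selectROI('select object', color)
cv2.destroyWindow('select object')

# Build a single-channel 0/255 mask with the same height/width as the color image.
mask = np.zeros(color.shape[:2], dtype=np.uint8)
mask[y:y + h, x:x + w] = 255

A rectangular mask is much coarser than an XMem segmentation, so registration quality may suffer if the box contains a lot of background.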

SeungHunJeon commented 4 months ago

Thank you for your response. However, I am facing an issue with estimation divergence when the object goes out of the camera's scene. Did you manage to resolve this issue?

#176

oliviaylee commented 3 months ago

For the first frame you need to first run register to initialize the pose, see https://github.com/NVlabs/FoundationPose/blob/main/run_demo.py#L52

Hi @wenbowen123, I am getting the same error as mentioned here for a custom object (run_demo.py works fine for the demo objects). I pass my custom object mesh file and test scene dir as arguments to run_demo.py, so this line is being run. I also record depth values in m instead of mm as suggested here. Could there be another source of this error?

Edit: When run with --debug 3, I get the same error as above: "ValueError: zero-size array to reduction operation maximum which has no identity".
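In case it helps with debugging: that exception is raised because the set of valid pixels inside register() ends up empty, so a quick sanity check (a sketch only, reusing the variable names from the script above) is to print the depth range and the overlap between the mask and usable depth right before calling register():

# Depth must be float meters; if max() is in the hundreds or thousands it is still in mm.
print('depth min/max:', float(depth.min()), float(depth.max()))

# The object mask must overlap pixels with usable depth, otherwise `valid` is empty.
# The 2.0 m upper bound is an arbitrary sanity limit for this check.
valid = (depth >= 0.1) & (depth <= 2.0) & (mask > 0)
print('mask pixels:', int((mask > 0).sum()), 'valid masked depth pixels:', int(valid.sum()))

If the valid count is zero, common causes are depth still in millimeters, a mask that is all zeros or has a different resolution than the depth image, or the object sitting outside the usable depth range.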