facebookresearch / pytorch3d

PyTorch3D is FAIR's library of reusable components for deep learning with 3D data
https://pytorch3d.org/

How to back-project an image to a 3D point cloud and render new views of the scene? #1902

Closed ugoleone closed 3 weeks ago

ugoleone commented 3 weeks ago

I would like to render new views of a scene, starting from a single image, its depth map, and camera poses.

I have an image taken from the LLFF dataset, the metric depth map corresponding to that image (in meters), and 5 camera poses provided by the LLFF dataset in the poses_bounds.npy file (including the pose of the camera from which the image was taken). All files are attached.

Poses, as documented, are stored in the poses_bounds.npy file in this way:

The pose matrix is a 3x4 camera-to-world affine transform concatenated with a 3x1 column [image height, image width, focal length] along axis=1.

The rotation (first 3x3 block in the camera-to-world transform) is stored in a somewhat unusual order, which is why there are the transposes. From the point of view of the camera, the three axes are [ down, right, backwards ] which some people might consider to be [-y,x,z].
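
For reference, here is a minimal parsing sketch of poses_bounds.npy with the axis reordering used by NeRF-style LLFF loaders ([down, right, backwards] to [right, up, backwards]). This only illustrates the storage format described above; it is not necessarily the conversion PyTorch3D expects:

import numpy as np

poses_bounds = np.load("poses_bounds.npy")        # (N, 17): 15 pose values + 2 depth bounds per image
poses = poses_bounds[:, :-2].reshape(-1, 3, 5)    # (N, 3, 5): 3x4 camera-to-world pose + [h, w, f] column
hwf = poses[:, :, 4]                              # per-image [height, width, focal length]
c2w = poses[:, :, :4]                             # rotation columns are [down, right, backwards], then translation
# Reorder the rotation columns from [down, right, backwards] to [right, up, backwards],
# as done in NeRF-style LLFF loaders (negating the "down" column gives "up"):
c2w = np.concatenate([c2w[:, :, 1:2], -c2w[:, :, 0:1], c2w[:, :, 2:]], axis=2)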

What I'm trying to do

  1. Create a camera for each pose (extrinsic and intrinsic camera parameters provided in poses_bounds.npy)
  2. Back-project the image to a 3D point cloud (I have a metric depth value for each pixel; see the back-projection sketch after this list)
  3. Render an image of the scene from each of the previously defined cameras
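
For step 2, the back-projection itself is just the standard pinhole model. A minimal NumPy sketch, using the OpenCV-style convention (x right, y down, z forward) and hypothetical camera-to-world inputs R_c2w and C; note that PyTorch3D's view-space convention differs (+X left, +Y up, +Z into the screen):

import numpy as np

def backproject_depth(depth, focal, cx, cy, R_c2w, C):
    # depth: (H, W) metric depth map; focal, cx, cy: intrinsics in pixels.
    # R_c2w (3, 3) and C (3,) are hypothetical camera-to-world pose inputs.
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h), indexing="xy")
    x = (u - cx) * depth / focal              # pinhole: X = (u - cx) * Z / f
    y = (v - cy) * depth / focal              # pinhole: Y = (v - cy) * Z / f
    pts_cam = np.stack([x, y, depth], axis=-1).reshape(-1, 3)
    return pts_cam @ R_c2w.T + C              # rotate/translate camera-frame points into world coordinates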

What I got

The images I'm obtaining are upside down. I suppose this is due to a wrong RT matrix conversion/multiplication.

[Image: OriginalSynth]

My code

I read here and here about methods to convert poses for the LLFF dataset, but using them I got worse results.

import cv2
import numpy as np
import open3d as o3d
import torch
from pytorch3d.renderer import (
    PerspectiveCameras,
    PointsRasterizationSettings, 
    PointsRenderer, 
    PointsRasterizer,
    AlphaCompositor
)
from pytorch3d.structures import Pointclouds
from pytorch3d.io import IO

############################### Utility Functions ###############################

def convert_camera_pose(pose):
    pose = torch.tensor(pose, dtype=torch.float32, device=torch.device("cuda:0"))
    conv_matrix = torch.tensor([[0, -1, 0],
                                [1,  0, 0],
                                [0,  0, 1]
                                ], dtype=torch.float32, device=torch.device("cuda:0"))
    R = pose[:, :3]
    T = pose[:, 3]
    R = conv_matrix.T @ R @ conv_matrix
    return R.unsqueeze(0), T.unsqueeze(0)

def image_generator(renderer, point_cloud):
    # Render the point cloud; the renderer returns an (N, H, W, C) image batch.
    gen_img = renderer(point_cloud.cuda())[0].cpu().numpy().astype(np.uint8)
    print(f"gen_img max: {gen_img.max()} | gen_img min: {gen_img.min()} | gen_img shape: {gen_img.shape}")
    return gen_img

############################### ################# ###############################

if __name__ == "__main__":
    # Paths to image and depth file
    image_path = "src/roasted_beef/0000.png"
    depth_path = "src/roasted_beef/0000.npz"
    pose_path = "src/roasted_beef/poses_bounds.npy"

    # Load image
    image = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    h, w = image.shape[:2]
    print(f"Image shape: {image.shape}")

    # Load depth map and convert from meters to millimeters
    depthmap = np.load(depth_path)['depth']*1000
    print(f"Depth map shape: {depthmap.shape}")

    # Load pose data
    # Poses are stored using [ down, right, backwards ] or [-y,x,z] in camera-to-world coordinates
    pose_data = np.load(pose_path)
    poses = pose_data[:, :-2].reshape([-1, 3, 5])   # (5, 3, 5): 3x4 pose + [h, w, f] column per image
    RT_0, RT_1, RT_2, RT_3, RT_4 = (poses[i][:, :-1] for i in range(5))

    # Convert to PyTorch3D camera-to-world pose
    R_0, T_0 = convert_camera_pose(RT_0)
    R_1, T_1 = convert_camera_pose(RT_1)
    R_2, T_2 = convert_camera_pose(RT_2)
    R_3, T_3 = convert_camera_pose(RT_3)
    R_4, T_4 = convert_camera_pose(RT_4)

    # Define camera parameters
    focal_length = 731.3691627864987
    principal_point = ((image.shape[1] / 2, image.shape[0] / 2),) #676.0 507.0
    #principal_point = torch.FloatTensor([[0.0, 0.0]])
    image_size = ((image.shape[0], image.shape[1]),)
    print(f"Focal length: {focal_length} | Principal point: {principal_point} | Image size: {image_size}")

    # Create Perspective cameras
    camera = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_0,
        T=T_0,
        device=torch.device("cuda:0"),
    )
    camera_1 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_1,
        T=T_1,
        device=torch.device("cuda:0"),
    )
    camera_2 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_2,
        T=T_2,
        device=torch.device("cuda:0"),
    )
    camera_3 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_3,
        T=T_3,
        device=torch.device("cuda:0"),
    )
    camera_4 = PerspectiveCameras(
        focal_length=focal_length,
        principal_point=principal_point,
        image_size=image_size,
        in_ndc=False,
        R=R_4,
        T=T_4,
        device=torch.device("cuda:0"),
    )
    print("Perspective cameras created.")

    # Image coordinates (u, v)
    h, w = image.shape[:2]
    u, v = np.meshgrid(np.arange(w), np.arange(h), indexing='xy')

    # Flatten the depth map and the mesh grid
    depth_flat = torch.tensor(depthmap.flatten(), dtype=torch.float32, device=torch.device("cuda:0"))
    u_flat = torch.tensor(u.flatten(), dtype=torch.float32, device=torch.device("cuda:0"))
    v_flat = torch.tensor(v.flatten(), dtype=torch.float32, device=torch.device("cuda:0"))

    # Points in screen coordinates (u, v, depth) [B, N, 3]
    xy_depth = torch.stack((u_flat, v_flat, depth_flat)).permute(1, 0).unsqueeze(0)

    # Unproject to the world coordinates
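    # Note: with in_ndc=False on the camera and from_ndc=False here, unproject_points
    # interprets xy_depth as screen coordinates: (u, v) in pixels plus the view-space depth value.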
    xyz_unproj_world = camera.unproject_points(xy_depth, world_coordinates=True, from_ndc=False)

    # Create a point cloud using pytorch3d
    colors_tensor = torch.tensor(image, dtype=torch.float32, device=torch.device("cuda:0")).view(-1, 3)
    point_cloud = Pointclouds(points=[xyz_unproj_world[0, :, :]], features=[colors_tensor])

    # Save the point cloud to a file
    IO().save_pointcloud(point_cloud, "outs/TEST_point_cloud.ply")

    # Rasterizer setup
    raster_settings = PointsRasterizationSettings(
        image_size=(h, w),
        radius=0.01,
        points_per_pixel=10
    )

    # Renderer setup
    renderer = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_1 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_1, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_2 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_2, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_3 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_3, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )
    renderer_4 = PointsRenderer(
        rasterizer=PointsRasterizer(cameras=camera_4, raster_settings=raster_settings),
        compositor=AlphaCompositor()
    )

    # Image generation
    gen_img_0 = image_generator(renderer, point_cloud)
    gen_img_1 = image_generator(renderer_1, point_cloud)
    gen_img_2 = image_generator(renderer_2, point_cloud)
    gen_img_3 = image_generator(renderer_3, point_cloud)
    gen_img_4 = image_generator(renderer_4, point_cloud)

    # Save rendered images
    cv2.imwrite("outs/rendered_image_0.png", cv2.cvtColor(gen_img_0, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_1.png", cv2.cvtColor(gen_img_1, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_2.png", cv2.cvtColor(gen_img_2, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_3.png", cv2.cvtColor(gen_img_3, cv2.COLOR_RGB2BGR))
    cv2.imwrite("outs/rendered_image_4.png", cv2.cvtColor(gen_img_4, cv2.COLOR_RGB2BGR))

I believe it is just a matter of the pose matrices and the use of the back-projection functions. I would appreciate any help on this, as I have not found a tutorial in the documentation covering this case.

bottler commented 3 weeks ago

The function get_rgbd_point_cloud is the recommended and friendly interface for back-projecting RGBD data through a camera.
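
Roughly something like this, reusing the image, depthmap, and camera variables from your script (a sketch only; check the docstring for the exact expected shapes, which should be (1, 3, H, W) for the image and (1, 1, H, W) for the depth):

import torch
from pytorch3d.implicitron.tools.point_cloud_utils import get_rgbd_point_cloud

# image: (H, W, 3) uint8 array, depthmap: (H, W) depth, camera: PerspectiveCameras (from the script above)
image_rgb = torch.tensor(image, dtype=torch.float32, device="cuda:0").permute(2, 0, 1)[None] / 255.0
depth = torch.tensor(depthmap, dtype=torch.float32, device="cuda:0")[None, None]
point_cloud = get_rgbd_point_cloud(camera, image_rgb=image_rgb, depth_map=depth)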

By looking at the file https://github.com/facebookresearch/pytorch3d/blob/main/pytorch3d/implicitron/dataset/llff_dataset_map_provider.py and the functions it calls, you should have an example of how to load the LLFF cameras. You may well be doing this correctly anyway.