🐛 Bugs / Unexpected behaviors

The zbufs returned by MeshRasterizer differ across devices, e.g., 37.4441 on CUDA vs. 37.4562 on CPU. Coincidentally, I found that the same problem exists in torch.einsum (if rasterization also uses this function internally, that may be the cause). Specifically, rotating points by an identity matrix with einsum introduces a small nonzero error on CUDA, whereas on CPU the error is exactly 0 (see perspective_projection below).
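The einsum discrepancy can also be checked in isolation. The following is a minimal sketch of that check (the random input and seed are my own choices, not from the repro; exact error magnitudes will vary by GPU and PyTorch build):

```python
# Minimal sketch: identity-matrix "rotation" via einsum on CPU vs. CUDA.
# Assumes a CUDA device is available; input data is arbitrary random points.
import torch

torch.manual_seed(0)
points = torch.randn(1, 1000, 3)
identity = torch.eye(3).unsqueeze(0)  # (1, 3, 3) identity rotation

for device in ("cpu", "cuda"):
    p = points.to(device)
    r = identity.to(device)
    rotated = torch.einsum("bij,bkj->bki", r, p)
    print(device, (rotated - p).abs().max().item())
# Reported behavior: exactly 0.0 on CPU, but a small nonzero error (~1e-5)
# on CUDA, where matmul kernels may reorder float32 accumulation.
```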
Instructions To Reproduce the Issue:
Any changes you made (git diff) or code you wrote:
```python
import os
os.environ["PYOPENGL_PLATFORM"] = "egl"
from typing import Optional
from matplotlib import pyplot as plt
import numpy as np
import torch
from pytorch3d.renderer import (
    RasterizationSettings,
    MeshRasterizer,
    MeshRenderer,
    TexturesVertex,
    HardPhongShader,
)
from pytorch3d.structures import Meshes
from pytorch3d.utils.camera_conversions import cameras_from_opencv_projection


def perspective_projection(points: torch.Tensor,
                           translation: torch.Tensor,
                           focal_length: torch.Tensor,
                           camera_center: Optional[torch.Tensor] = None,
                           rotation: Optional[torch.Tensor] = None) -> torch.Tensor:
    """
    Computes the perspective projection of a set of 3D points.
    Args:
        points (torch.Tensor): Tensor of shape (B, N, 3) containing the input 3D points.
        translation (torch.Tensor): Tensor of shape (B, 3) containing the 3D camera translation.
        focal_length (torch.Tensor): Tensor of shape (B, 2) containing the focal length in pixels.
        camera_center (torch.Tensor): Tensor of shape (B, 2) containing the camera center in pixels.
        rotation (torch.Tensor): Tensor of shape (B, 3, 3) containing the camera rotation.
    Returns:
        torch.Tensor: Tensor of shape (B, N, 2) containing the projection of the input points.
    """
    batch_size = points.shape[0]
    if rotation is None:
        rotation = torch.eye(3, device=points.device, dtype=points.dtype).unsqueeze(0).expand(batch_size, -1, -1)
    if camera_center is None:
        camera_center = torch.zeros(batch_size, 2, device=points.device, dtype=points.dtype)

    # Populate intrinsic camera matrix K.
    K = torch.zeros([batch_size, 3, 3], device=points.device, dtype=points.dtype)
    K[:, 0, 0] = focal_length[:, 0]
    K[:, 1, 1] = focal_length[:, 1]
    K[:, 2, 2] = 1.0
    K[:, :-1, -1] = camera_center

    # Rotate the points. With rotation == identity this should be a no-op,
    # but einsum on CUDA introduces a small error (see the output below).
    rotation_bef, points_bef = rotation.clone(), points.clone()
    points = torch.einsum('bij,bkj->bki', rotation, points)
    print("rotation_bef[0, 0]", rotation_bef[0, 0])
    print("points_bef[0, 0]", points_bef[0, 0])
    print("points[0, 0] after einsum", points[0, 0])
    print("(rotation - rotation_bef).abs().mean()", (rotation - rotation_bef).abs().mean())
    print("(points - points_bef).abs().mean()", (points - points_bef).abs().mean())

    # Apply translation and perspective division.
    points = points + translation.unsqueeze(1)
    projected_points = points / points[:, :, -1].unsqueeze(-1)

    # Apply camera intrinsics and drop the homogeneous coordinate.
    projected_points = torch.einsum('bij,bkj->bki', K, projected_points)
    return projected_points[:, :, :-1]
```
```python
focal_length = 5000
image_size = 224
faces_per_pixel = 2
vizimg = True

dtype = torch.float32
device = torch.device('cuda')  # torch.device('cpu')

npz = np.load('mesh.npz', mmap_mode='r')
joints, vertices, faces, cam_t = (torch.from_numpy(npz['joints']).to(device),
                                  torch.from_numpy(npz['vertices']).to(device),
                                  torch.from_numpy(npz['faces']).to(device),
                                  torch.from_numpy(npz['cam_t']).to(device))
B = joints.shape[0]

textures = TexturesVertex(torch.ones_like(vertices))
meshes = Meshes(vertices, faces, textures=textures)

# Using the OpenCV coordinate system.
cameras = cameras_from_opencv_projection(
    torch.eye(3, device=device)[None, ...].repeat(B, 1, 1),
    cam_t,
    torch.tensor([[focal_length, 0, image_size / 2],
                  [0, focal_length, image_size / 2],
                  [0, 0, 1]], device=device)[None, ...].repeat(B, 1, 1),
    torch.ones(B, 2, device=device) * image_size,
)
raster_settings = RasterizationSettings(image_size=image_size, faces_per_pixel=faces_per_pixel)
rasterizer = MeshRasterizer(cameras=cameras, raster_settings=raster_settings)

zbufs = rasterizer(meshes).zbuf  # shape: (N, H, W, K)

renderer = MeshRenderer(rasterizer, HardPhongShader(device=device, cameras=cameras))
images = renderer(meshes)

keypoints_2d = perspective_projection(
    joints, cam_t,
    torch.ones(B, 2, device=device) * focal_length,
    camera_center=torch.ones(B, 2, device=device) * image_size / 2,
)

jid = 25 + 6
print("keypoints_2d[0, jid]", keypoints_2d[0, jid])
print("zbufs[0, 23, 134]", zbufs[0, 23, 134])
print("joints[0, jid, 2] + cam_t[0, 2]", joints[0, jid, 2] + cam_t[0, 2])

if vizimg:
    plt.close()
    plt.imshow(images[0].cpu().numpy())
    plt.savefig('image.png')
```
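To quantify the rasterizer discrepancy directly, the same scene can be rasterized on both devices and the depth buffers compared. The helper below is a hypothetical sketch (compare_zbufs and its arguments are my naming, not part of the repro above):

```python
# Hypothetical helper (my naming): rasterize the same meshes on CPU and CUDA
# and report the zbuf discrepancy over pixels covered on both devices.
def compare_zbufs(rasterizer_cpu, meshes_cpu, rasterizer_cuda, meshes_cuda):
    zbuf_cpu = rasterizer_cpu(meshes_cpu).zbuf            # (N, H, W, K)
    zbuf_cuda = rasterizer_cuda(meshes_cuda).zbuf.cpu()   # move to CPU to compare
    covered = (zbuf_cpu >= 0) & (zbuf_cuda >= 0)          # zbuf is -1 on empty pixels
    diff = (zbuf_cpu - zbuf_cuda).abs()[covered]
    print("max |dzbuf|:", diff.max().item(), "mean |dzbuf|:", diff.mean().item())
```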
Output on CUDA:

```
rotation_bef[0, 0] tensor([1., 0., 0.], device='cuda:0')
points_bef[0, 0] tensor([ 0.1155, -0.8138, -0.2963], device='cuda:0')
points[0, 0] after einsum tensor([ 0.1155, -0.8140, -0.2964], device='cuda:0')
(rotation - rotation_bef).abs().mean() tensor(0., device='cuda:0')
(points - points_bef).abs().mean() tensor(6.2104e-05, device='cuda:0')
keypoints_2d[0, jid] tensor([133.5530, 23.0413], device='cuda:0')
zbufs[0, 23, 134] tensor([37.4441, 37.5067], device='cuda:0')
joints[0, jid, 2] + cam_t[0, 2] tensor(37.5164, device='cuda:0')
```

Output on CPU:

```
rotation_bef[0, 0] tensor([1., 0., 0.])
points_bef[0, 0] tensor([ 0.1155, -0.8138, -0.2963])
(rotation - rotation_bef).abs().mean() tensor(0.)
(points - points_bef).abs().mean() tensor(0.)
points[0, 0] after einsum tensor([ 0.1155, -0.8138, -0.2963])
keypoints_2d[0, jid] tensor([133.5542, 22.9965])
zbufs[0, 23, 134] tensor([37.4562, 37.5189])
joints[0, jid, 2] + cam_t[0, 2] tensor(37.5164)
```
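One possible factor worth ruling out (my assumption, not confirmed by the logs above): on Ampere-class GPUs, PyTorch may execute float32 matmuls/einsums in TF32, which reduces mantissa precision. Disabling it before running the repro is a quick test:

```python
# Assumption to test, not a confirmed fix: force full float32 precision for
# CUDA matmul/einsum by disabling TF32, then re-run the repro script.
import torch
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
```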