Sisso16 closed this issue 2 months ago.
Can't remember exactly; I think when we originally did it we didn't look at the paper properly and wanted to do our own version, and once it was done we never got around to fixing it. You mean something like this instead?
```python
import numpy as np
import torch
import torch.nn.functional as F


def generate_2D_gaussian_splatting(kernel_size, scale, rotation, coords, colours, image_size=(256, 256, 3), device="cpu"):
    batch_size = colours.shape[0]

    # Ensure scale and rotation have the correct shape
    scale = scale.view(batch_size, 2)
    rotation = rotation.view(batch_size)

    # Build the rotation matrix R for each Gaussian
    cos_rot = torch.cos(rotation)
    sin_rot = torch.sin(rotation)
    R = torch.stack([
        torch.stack([cos_rot, -sin_rot], dim=-1),
        torch.stack([sin_rot, cos_rot], dim=-1)
    ], dim=-2)

    # Diagonal scale matrix S; since S is diagonal, S @ S == S @ S^T
    S = torch.diag_embed(scale)

    # Covariance matrix RSS^TR^T -- positive semi-definite by construction
    covariance = R @ S @ S @ R.transpose(-1, -2)
    inv_covariance = torch.inverse(covariance)

    # Evaluate the Gaussian density on a kernel_size x kernel_size grid
    x = torch.linspace(-5, 5, kernel_size, device=device)
    y = torch.linspace(-5, 5, kernel_size, device=device)
    xx, yy = torch.meshgrid(x, y, indexing='ij')
    xy = torch.stack([xx, yy], dim=-1).unsqueeze(0).expand(batch_size, -1, -1, -1)

    # Quadratic form -0.5 * x^T Sigma^{-1} x, batched over Gaussians and grid points
    z = torch.einsum('bxyi,bij,bxyj->bxy', xy, -0.5 * inv_covariance, xy)
    kernel = torch.exp(z) / (2 * torch.tensor(np.pi, device=device) * torch.sqrt(torch.det(covariance))).view(batch_size, 1, 1)

    # Normalize each kernel so its peak value is 1
    kernel_max = kernel.amax(dim=(-2, -1), keepdim=True)
    kernel_normalized = kernel / kernel_max

    # Broadcast the kernel across the three RGB channels
    kernel_rgb = kernel_normalized.unsqueeze(1).expand(-1, 3, -1, -1)

    # Zero-pad the kernel up to the full image size
    pad_h = image_size[0] - kernel_size
    pad_w = image_size[1] - kernel_size
    if pad_h < 0 or pad_w < 0:
        raise ValueError("Kernel size should be smaller than or equal to the image size.")
    padding = (pad_w // 2, pad_w // 2 + pad_w % 2, pad_h // 2, pad_h // 2 + pad_h % 2)
    kernel_rgb_padded = F.pad(kernel_rgb, padding, "constant", 0)

    # Translate each kernel to its target position via an affine grid
    b, c, h, w = kernel_rgb_padded.shape
    theta = torch.zeros(b, 2, 3, dtype=torch.float32, device=device)
    theta[:, 0, 0] = 1.0
    theta[:, 1, 1] = 1.0
    theta[:, :, 2] = coords
    grid = F.affine_grid(theta, size=(b, c, h, w), align_corners=True)
    kernel_rgb_padded_translated = F.grid_sample(kernel_rgb_padded, grid, align_corners=True)

    # Tint each kernel with its colour and sum all layers into one image
    rgb_values_reshaped = colours.unsqueeze(-1).unsqueeze(-1)
    final_image_layers = rgb_values_reshaped * kernel_rgb_padded_translated
    final_image = final_image_layers.sum(dim=0)
    final_image = torch.clamp(final_image, 0, 1)
    final_image = final_image.permute(1, 2, 0)

    return final_image
```
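For reference, a minimal usage sketch, assuming the function above as-is; the parameter values are made up for illustration:

```python
# Hypothetical example: splat 100 random Gaussians into a 256x256 RGB image.
num_gaussians = 100
device = "cpu"

scale = torch.rand(num_gaussians, 2, device=device) * 0.5 + 0.5    # s_x, s_y in [0.5, 1.0)
rotation = torch.rand(num_gaussians, device=device) * 2 * np.pi    # angles in [0, 2*pi)
coords = torch.rand(num_gaussians, 2, device=device) * 2 - 1       # positions in [-1, 1]
colours = torch.rand(num_gaussians, 3, device=device)              # RGB in [0, 1)

image = generate_2D_gaussian_splatting(
    kernel_size=101,
    scale=scale,
    rotation=rotation,
    coords=coords,
    colours=colours,
    image_size=(256, 256, 3),
    device=device,
)
print(image.shape)  # torch.Size([256, 256, 3])
```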
I will give it a try; that might help us with setting a higher learning rate as well, and it would also be cleaner when we do a 3D version in one notebook. Unfortunately, you end up with the same number of parameters to optimise either way (see the sketch below).
Cheers, Ash
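For context on the parameter-count point above, a minimal sketch (assuming 0-dim tensor inputs; names are illustrative) showing that both parameterisations carry three scalars per Gaussian, but only the decomposed form is positive semi-definite by construction:

```python
import torch

# Direct parameterisation: sigma_x, sigma_y, rho -- three scalars per Gaussian.
# Only a valid covariance when sigma_x > 0, sigma_y > 0 and |rho| < 1.
def cov_from_sigmas(sigma_x, sigma_y, rho):
    off_diag = rho * sigma_x * sigma_y
    return torch.stack([
        torch.stack([sigma_x**2, off_diag]),
        torch.stack([off_diag, sigma_y**2]),
    ])

# Decomposed parameterisation: s_x, s_y, theta -- also three scalars per
# Gaussian, but R S S^T R^T is positive semi-definite for any real inputs.
def cov_from_rotation_scale(s_x, s_y, theta):
    c, s = torch.cos(theta), torch.sin(theta)
    R = torch.stack([torch.stack([c, -s]), torch.stack([s, c])])
    S = torch.diag(torch.stack([s_x, s_y]))
    return R @ S @ S.T @ R.T
```

So switching decompositions changes the constraint structure, not the parameter count.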
Pushed a fix, now in main
Hi there, first of all, nice work! Coming to my concern: I was wondering why, instead of using a covariance decomposition similar to the one in the original 3D Gaussian Splatting paper (RSS^TR^T), which guarantees the matrix is positive semi-definite, you directly optimise sigma_x, sigma_y, and rho. How does this affect training, given that you have an assertion error in the `generate_2D_gaussian_splatting` function but no explicit control throughout training? Thanks for your answer!
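For context, one generic way to keep the direct parameterisation valid throughout training, rather than relying on a runtime assertion, is to optimise unconstrained raw values and squash them into the valid region; this is a sketch of the general technique, not necessarily what the repo does:

```python
import torch

# Unconstrained raw parameters, updated freely by the optimiser.
raw_sigma = torch.randn(100, 2, requires_grad=True)
raw_rho = torch.randn(100, requires_grad=True)

# Squash into the valid region before building each covariance matrix:
# sigmas strictly positive, rho strictly inside (-1, 1), so the matrix
# stays positive semi-definite at every training step.
sigma = torch.sigmoid(raw_sigma) * 2.0   # sigma_x, sigma_y in (0, 2)
rho = torch.tanh(raw_rho)                # rho in (-1, 1)
```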