I want to use the test set in the semantic-kitti dataset for inference visualization #100

Closed SZUshenyan closed 4 months ago

SZUshenyan commented 6 months ago

Hello, I want to use the test set of the SemanticKITTI dataset for inference visualization. I have successfully used Mayavi to visualize inference results on the validation set, but visualizing the test set fails. The error says the preprocessed .npy file is missing, but that .npy file is obtained by processing the .label and .invalid files. So I want to ask: are the .label and .invalid files strictly required for inference visualization?

lzbushicai commented 6 months ago

The SemanticKITTI data are labeled only for the first 11 sequences and not for the later ones. Inference only requires images, not labels.

anhquancao commented 6 months ago

Yes, there are no labels for the test sequences. You can ignore these files during inference.

SZUshenyan commented 6 months ago


Hello, I have run the inference code, but nothing appears on Gradio. I now want to run inference on my own image data, save the result as a .pkl file, and then use Mayavi for visualization. Can you help me? (screenshot: 图片1)

anhquancao commented 5 months ago

Hi, I don't understand. Can you use my visualization script to visualize it?

SZUshenyan commented 5 months ago

Hello, I used two methods to run inference on my own image data. The first is the inference code you provided on Hugging Face, which already runs successfully on Gradio (screenshot: first). The second method is to first use my own inference script to process the image data and save the result as a .pkl file, and then use my visualization script to visualize the .pkl file. However, the second method produces a different 3D semantic occupancy prediction than the first method (screenshot: second). I have posted the code of both scripts below; please help me see where the problem is. Looking forward to your reply, thank you! In addition, I also want to use the 3D semantic occupancy prediction results obtained from MonoScene for path planning (using the improved APF method). Do you think that is feasible?

SZUshenyan commented 5 months ago

import os
import cv2
import numpy as np
import torch
from torchvision import transforms
import pickle
# from helpers import *
from helpers import get_projections, majority_pooling, draw
from monoscene.monoscene import MonoScene


# NOTE(review): the checkpoint path (and possibly other keyword arguments)
# were lost from the pasted snippet. Set CHECKPOINT_PATH to the MonoScene
# checkpoint you actually use and re-add any further arguments — confirm.
CHECKPOINT_PATH = "monoscene_kitti.ckpt"

model = MonoScene.load_from_checkpoint(
    CHECKPOINT_PATH,
    full_scene_size=(256, 256, 32),
)
# Inference only: put the network in evaluation mode.
model.eval()

# Image size the projections below are computed for.
img_W, img_H = 1220, 370
input_folder = "/home/zhx/Data/sy/MonoScene2/images/mydataset/"
output_folder = "/home/zhx/Data/sy/MonoScene2/images/myoutput_pkls"

os.makedirs(output_folder, exist_ok=True)

# The preprocessing pipeline does not depend on the image, so build it once.
normalize_rgb = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),
    ]
)

for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith((".jpg", ".png")):
        continue

    img_path = os.path.join(input_folder, filename)
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None.
        print(f"Could not read {img_path}, skipping")
        continue
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # NOTE(review): the projections are built for an img_W x img_H image, so
    # an input of another size presumably needs resizing first — confirm this
    # matches the preprocessing of the reference (Gradio) pipeline.
    if img.shape[:2] != (img_H, img_W):
        img = cv2.resize(img, (img_W, img_H))

    # uint8 -> float32 always needs a copy; `copy=False` raises on NumPy >= 2.
    img = np.asarray(img, dtype=np.float32) / 255.0
    img = normalize_rgb(img)

    batch = get_projections(img_W, img_H)
    batch["img"] = img
    for k in batch:
        # Add a leading batch dimension to every entry.
        batch[k] = batch[k].unsqueeze(0)  # .cuda()

    with torch.no_grad():
        pred = model(batch).squeeze()
    pred = majority_pooling(pred, k_size=2)

    # Save prediction result as a .pkl file
    output_dict = {
        "pred": pred.astype(np.uint16),
        "fov_mask": batch["fov_mask_2"].detach().cpu().numpy(),
    }
    output_filename = os.path.splitext(filename)[0] + ".pkl"
    output_path = os.path.join(output_folder, output_filename)
    with open(output_path, "wb") as f:
        pickle.dump(output_dict, f)

    print(f"Prediction saved as {output_path}")
SZUshenyan commented 5 months ago

# from operator import gt
import pickle
import numpy as np
from omegaconf import DictConfig
import hydra
from mayavi import mlab

def get_grid_coords(dims, resolution):
    """Return the centre coordinates of every voxel of a regular grid.

    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
    :param resolution: edge length of one voxel; voxel indices are scaled by it
    :return coords_grid: float64 array of shape
        (dims[0] * dims[1] * dims[2], 3) holding the centre coords of the
        voxels in the grid. Row ``i`` belongs to the voxel with flat (C-order)
        index ``i`` of an array of shape ``dims``.
    """
    g_xx = np.arange(0, dims[0])
    g_yy = np.arange(0, dims[1])
    g_zz = np.arange(0, dims[2])

    # "ij" indexing keeps the x index slowest, so the rows line up with
    # `voxels.reshape(-1)`. The default "xy" indexing followed by swapping the
    # first two columns only does so when dims[0] == dims[1].
    xx, yy, zz = np.meshgrid(g_xx, g_yy, g_zz, indexing="ij")
    coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
    # `np.float` was removed in NumPy 1.24; use the explicit dtype.
    coords_grid = coords_grid.astype(np.float64)

    # Shift from the voxel corner to the voxel centre.
    coords_grid = (coords_grid * resolution) + resolution / 2

    return coords_grid

def draw(
    voxels,
    fov_mask,
    voxel_size=0.2,
    d=7,  # 7m - determine the size of the mesh representing the camera
):
    """Show the occupied voxels of a semantic voxel grid in a Mayavi window.

    NOTE(review): several lines of this function were missing from the pasted
    snippet (signature entries, closing brackets, some ``points3d`` keyword
    arguments, the ``uint8`` cast and the final ``mlab.show()``). They were
    restored here as assumptions — confirm against your local copy.

    :param voxels: 3D array of class ids; only values > 0 and < 255 are drawn
        (the rest are treated as empty / unknown)
    :param fov_mask: mask with one entry per voxel (any shape, it is
        flattened); truthy entries mark voxels inside the field of view
    :param voxel_size: edge length of one voxel, used for the coordinates and
        the glyph size
    :param d: unused in this variant. The camera-frustum mesh it sized was
        disabled because it needs T_velo_2_cam, vox_origin, img_size and f.
        Kept so existing calls stay valid.
    """
    # `~mask` is only a logical NOT for boolean arrays, so force the dtype.
    fov_mask = np.asarray(fov_mask).reshape(-1).astype(bool)

    # Compute the voxels coordinates
    grid_coords = get_grid_coords(
        [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
    )

    # Attach the predicted class to every voxel
    grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T

    # Split the voxels into inside / outside the field of view
    fov_grid_coords = grid_coords[fov_mask, :]
    outfov_grid_coords = grid_coords[~fov_mask, :]

    # Remove empty and unknown voxels
    fov_voxels = fov_grid_coords[
        (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255)
    ]
    outfov_voxels = outfov_grid_coords[
        (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255)
    ]

    figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))

    # NOTE(review): the keyword arguments other than `scale_factor` were not
    # in the pasted snippet; these are assumed values — confirm.
    glyph_kwargs = dict(
        colormap="viridis",
        scale_factor=voxel_size - 0.05 * voxel_size,
        mode="cube",
        opacity=1.0,
        vmin=1,
        vmax=19,
    )

    # Draw occupied inside FOV voxels
    plt_plot_fov = mlab.points3d(
        fov_voxels[:, 0],
        fov_voxels[:, 1],
        fov_voxels[:, 2],
        fov_voxels[:, 3],
        **glyph_kwargs,
    )

    # Draw occupied outside FOV voxels
    plt_plot_outfov = mlab.points3d(
        outfov_voxels[:, 0],
        outfov_voxels[:, 1],
        outfov_voxels[:, 2],
        outfov_voxels[:, 3],
        **glyph_kwargs,
    )

    # One RGBA row per drawn class id.
    colors = np.array(
        [
            [100, 150, 245, 255],
            [100, 230, 245, 255],
            [30, 60, 150, 255],
            [80, 30, 180, 255],
            [100, 80, 250, 255],
            [255, 30, 30, 255],
            [255, 40, 200, 255],
            [150, 30, 90, 255],
            [255, 0, 255, 255],
            [255, 150, 255, 255],
            [75, 0, 75, 255],
            [175, 0, 75, 255],
            [255, 200, 0, 255],
            [255, 120, 50, 255],
            [0, 175, 0, 255],
            [135, 60, 0, 255],
            [150, 240, 80, 255],
            [255, 240, 150, 255],
            [255, 0, 0, 255],
        ]
    ).astype(np.uint8)

    plt_plot_fov.glyph.scale_mode = "scale_by_vector"
    plt_plot_outfov.glyph.scale_mode = "scale_by_vector"

    plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors

    # Darken the out-of-FOV palette on a copy, so the in-FOV table assigned
    # above cannot be changed through a shared buffer.
    outfov_colors = colors.copy()
    outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
    plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors

    mlab.show()

# NOTE(review): the decorator, the `draw(` call and the `main()` call were
# missing from the pasted snippet and are restored here as assumptions.
# `config_path=None` in particular is a guess — confirm.
@hydra.main(config_path=None)
def main(config: DictConfig):
    """Load one saved prediction and display it.

    :param config: configuration whose ``file`` entry is the path of a pickle
        holding a dict with the keys ``"pred"`` and ``"fov_mask"``
    """
    scan = config.file
    # pickle.load can execute arbitrary code: only open files you created.
    with open(scan, "rb") as handle:
        b = pickle.load(handle)

    fov_mask = b["fov_mask"]
    pred = b["pred"]

    # The camera-related arguments (T_velo_2_cam, vox_origin, img_size, f, d)
    # were commented out in this variant, so they are not passed.
    draw(
        pred,
        fov_mask,
        voxel_size=0.2,
    )


if __name__ == "__main__":
    main()
2108LEO commented 5 months ago


You should modify the data loader in the script according to the note in its comment:

`data_loader = data_module.val_dataloader()`

`data_loader = data_module.test_dataloader()  # use this if you want to infer on test set`