Trying to use the pose and image for ML

I found this repo through twitter and found it to be really nicely written and performant so I wanted to use it for 3d segmentation, using a method from a github repo called sam3d. So I combined parts of your repo with torchops with python.

I specifically used your way of loading in las data:

std::vector<at::Tensor> loadLasFile(std::vector<std::string> paths) {

    printfmt("hello\n");
    initCuda();
    printfmt("We get here\n");
    initCudaProgram();
    printfmt("We init cuda program\n");

    bool debug = false;
    auto cpu = getCpuData();
    int numThreads = 1;

    printfmt("cpu.numProcessors: {} \n", cpu.numProcessors);
    printfmt("launching {} loader threads \n", numThreads);

    pinnedMemPool.reserveSlots(PINNED_MEM_POOL_SIZE);
    printfmt("pinnedMemPool.pool.size(): {} \n", pinnedMemPool.pool.size());

    mtx_loader.reserve(numThreads);
    for(int i = 0; i < numThreads; i++){
        mtx_loader.push_back(make_unique<mutex>());
        spawnLoader(i);
    }

    spawnUploader();

    reload();

    readFiles(paths);

    while (!(batchStreamUploadIndex == numBatchesTotal)) {
        std::this_thread::sleep_for(10ms);
        printfmt("batchStreamUploadIndex: {} (total: {}) \n", batchStreamUploadIndex, numBatchesTotal);
    }
    // Create tensors to store data from Point arrays, for GPU 
    std::vector<at::Tensor> tensorsCUDA = createTensorsCUDA(targetCountVector);
    printfmt("tensors created \n");

    printfmt("resetting \n");
    freeCUDAMemory(targetCountVector);
    resetCUDA();
    //freeCUDAMemory(targetCountVector);

    return tensorsCUDA;

}

And generated a transform that is equivalent to the uniform.transform with:

std::tuple<at::Tensor, at::Tensor, at::Tensor> generate_world_view_projection(at::Tensor xyz, at::Tensor colors, double radius = 21.906,
                                             at::Tensor target = at::tensor({ 6.569, 17.436, -1.126, }),
                                             double yaw = 0.062, double pitch = -0.350) {
    // Create translation matrices and convert to float64
    at::Tensor translateRadius = at::eye(4, at::dtype(torch::kFloat64));
    translateRadius[2][3] = radius;

    std::cout << "TranslateRadius Tensor:\n" << translateRadius << std::endl;

    at::Tensor translateTarget = at::eye(4, at::dtype(torch::kFloat64));
    // print the translate target slice
    std::cout << "TranslateTarget Tensor Slice:\n" << translateTarget.index({3,Slice(None, 3)}) << std::endl;
    // print the target
    std::cout << "Target Tensor:\n" << target << std::endl;
    translateTarget.index_put_({3,Slice(None, 3)}, target);

    std::cout << "TranslateTarget Tensor:\n" << translateTarget << std::endl;

    // Create rotation matrices
    at::Tensor rotYaw = at::tensor({std::cos(-yaw), -std::sin(-yaw), 0.0, 0.0,
                                        std::sin(-yaw), std::cos(-yaw), 0.0, 0.0,
                                        0.0, 0.0, 1.0, 0.0,
                                        0.0, 0.0, 0.0, 1.0}).reshape({4, 4});

    // Print the rotYaw tensor
    std::cout << "RotYaw Tensor:\n" << rotYaw << std::endl;

    at::Tensor rotPitch = at::tensor({1.0, 0.0, 0.0, 0.0, 0.0, std::cos(-pitch), -std::sin(-pitch), 0.0, 0.0, std::sin(-pitch), std::cos(-pitch), 0.0, 0.0, 0.0, 0.0, 1.0}).reshape({4, 4});

    // Print the rotPitch tensor
    std::cout << "RotPitch Tensor:\n" << rotPitch << std::endl;

    at::Tensor flip = at::tensor({1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0}).reshape({4, 4});
    // Print the flip tensor
    std::cout << "Flip Tensor:\n" << flip << std::endl;

    // Calculate the world matrix
    //at::Tensor world = at::matmul(at::matmul(at::matmul(at::matmul(translateRadius, flip), rotPitch),
                                                     //rotYaw), translateTarget);
    // Print the shape of flip and translateRadius
    std::cout << "Flip Tensor Shape:\n" << flip.sizes() << std::endl;
    std::cout << "TranslateRadius Tensor Shape:\n" << translateRadius.sizes() << std::endl;
    auto interm1 = at::matmul(translateRadius, flip);
    std::cout << "Interm1 Tensor:\n" << interm1 << std::endl;
    auto interm2 = at::matmul(interm1, rotPitch);
    std::cout << "Interm2 Tensor:\n" << interm2 << std::endl;
    auto interm3 = at::matmul(interm2, rotYaw);
    std::cout << "Interm3 Tensor:\n" << interm3 << std::endl;
    auto interm4 = at::matmul(interm3, translateTarget);
    std::cout << "Interm4 Tensor:\n" << interm4 << std::endl;
    at::Tensor world = interm4;
    // Print the world tensor
    std::cout << "World Tensor:\n" << world << std::endl;

    at::Tensor view = at::inverse(world);

    // Print the view tensor
    std::cout << "View Tensor:\n" << view << std::endl;

    // Define constants
    double fovy = 60.0;
    double near = 0.1;
    double far = 2000000.0;

    // Create a projection matrix
    double aspect_ratio = 1.82;
    // Convert degrees to radians
    double fovy_rad = fovy * M_PI / 180.0;
    //double fovy_rad = at::deg2rad(at::Tensor(fovy)).item<double>();
    at::Tensor proj_matrix = at::tensor({
        1.0 / (aspect_ratio * std::tan(fovy_rad / 2)), 0.0, 0.0, 0.0,
        0.0, 1.0 / std::tan(fovy_rad / 2), 0.0, 0.0,
        0.0, 0.0, -(far + near) / (far - near), -2 * far * near / (far - near),
        0.0, 0.0, -1.0, 0.0
    }, at::dtype(torch::kFloat64)).reshape({4, 4});

    // Print the proj_matrix tensor
    std::cout << "Proj Matrix Tensor:\n" << proj_matrix << std::endl;

    at::Tensor world_view = at::matmul(view, proj_matrix);

    // Print the world_view tensor
    std::cout << "World View Tensor:\n" << world_view << std::endl;

    at::Tensor pointfloat = at::tensor({10.7, 9.21, 0.69, 1.00}, at::dtype(torch::kFloat64)).reshape({4, 1});
    at::Tensor ndc = at::matmul(at::transpose(world_view, 0, 1), pointfloat);

    std::cout << "NDC Tensor(unnormalized):\n" << ndc << std::endl;
    at::Tensor ndc_divided = ndc / ndc[3];  // Perform division without modifying ndc in-place
    ndc = ndc_divided;

    // Print the ndc tensor
    std::cout << "NDC Tensor:\n" << ndc << std::endl;

    at::Tensor world_view_tensor = world_view.to(torch::kCUDA);
    //at::Tensor xyz_hom = at::cat({xyz, at::ones({xyz.size(0), 1},xyz.device() ,at::dtype(torch::kFloat64))}, 1);
    // Make at::ones the same device as xyz
    at::Tensor ones_tensor = at::ones({xyz.size(0), 1}, at::dtype(torch::kFloat64));
    ones_tensor = ones_tensor.to(xyz.device());

    at::Tensor xyz_hom = at::cat({xyz, ones_tensor}, 1);
    // Print the xyz_hom tensor

    at::Tensor ndc_tensor = at::matmul(world_view_tensor.transpose(0, 1), xyz_hom.transpose(0, 1));

    //std::cout << "NDC Tensor(unnormalized):\n" << ndc_tensor << std::endl;

    at::Tensor depth = ndc_tensor[3];
    at::Tensor ndc_tensor_divided = ndc_tensor / ndc_tensor[3];
    ndc_tensor = ndc_tensor_divided.transpose(0, 1);
    //ndc_tensor = ndc_tensor.transpose(0, 1);
    std::cout << "NDC Tensor:\n" << ndc_tensor.index({Slice(0, 10)}) << std::endl;

    // Image dimensions (width and height)
    int image_width = 3740;
    int image_height = 2060;

    // Calculate x and y using tensor operations
    at::Tensor x = ((ndc_tensor.select(1, 0) * 0.5 + 0.5) * image_width).to(torch::kInt64);
    at::Tensor y = ((ndc_tensor.select(1, 1) * 0.5 + 0.5) * image_height).to(torch::kInt64);

    // Print the x and y tensors
    std::cout << "x Tensor:\n" << x.index({Slice(0, 10)}) << std::endl;
    std::cout << "y Tensor:\n" << y.index({Slice(0, 10)}) << std::endl;

    // Rest of the code
    at::Tensor valid_coordinates = (x > 1) & (x < (image_width - 2)) & (y > 1) & (y < (image_height - 2));

    std::cout << "valid_coordinates Tensor:\n" << valid_coordinates.index({Slice(0, 10)}) << std::endl;

    x = x.masked_select(valid_coordinates);
    y = y.masked_select(valid_coordinates);
    depth = depth.masked_select(valid_coordinates);
    colors = colors.index_select(0, valid_coordinates.nonzero().view(-1));

    std::cout << "x Tensor:\n" << x.index({Slice(0, 10)}) << std::endl;
    std::cout << "y Tensor:\n" << y.index({Slice(0, 10)}) << std::endl;
    std::cout << "depth Tensor:\n" << depth.index({Slice(0, 10)}) << std::endl;
    std::cout << "colors Tensor:\n" << colors.index({Slice(0, 10)}) << std::endl;

    at::Tensor valid_indices = at::arange(xyz.size(0), torch::kInt32).to(at::kCUDA).index_select(0, valid_coordinates.nonzero().view(-1));
    std::cout << "valid_indices Tensor:\n" << valid_indices.index({Slice(0, 10)}) << std::endl;

    // Create xydepth tensor
    // Convert x and y to double tensors
    at::Tensor x_float = x.unsqueeze(1).to(torch::kFloat32 ).to(x.device());
    at::Tensor y_float = y.unsqueeze(1).to(torch::kFloat32 ).to(y.device());
    at::Tensor depth_float = depth.unsqueeze(1).to(torch::kFloat32).to(depth.device());

    std::cout << "x_float Tensor:\n" << x_float.index({Slice(0, 10)}) << std::endl;

    // Check and print scalar types
    std::cout << "x_float Scalar Type: " << x_float.scalar_type() << std::endl;
    std::cout << "y_float Scalar Type: " << y_float.scalar_type() << std::endl;
    std::cout << "depth_float Scalar Type: " << depth_float.scalar_type() << std::endl;

    // Check and print devices
    std::cout << "x_float Device: " << x_float.device() << std::endl;
    std::cout << "y_float Device: " << y_float.device() << std::endl;
    std::cout << "depth_float Device: " << depth_float.device() << std::endl;

    // Check and print shapes
    std::cout << "x_float Shape: " << x_float.sizes() << std::endl;
    std::cout << "y_float Shape: " << y_float.sizes() << std::endl;
    std::cout << "depth_float Shape: " << depth_float.sizes() << std::endl;

    // Check and print types
    std::cout << "x_float Scalar Type: " << x_float.scalar_type() << std::endl;
    std::cout << "y_float Scalar Type: " << y_float.scalar_type() << std::endl;
    std::cout << "depth_float Scalar Type: " << depth_float.scalar_type() << std::endl;

    // Concatenate x, y, and depth tensors
    at::Tensor xydepth = at::cat({x_float, y_float, depth_float}, 1);
    //torch::Tensor xydepth = torch::cat({x.unsqueeze(1).to(torch::kFloat64), y.unsqueeze(1).to(torch::kFloat64), depth.unsqueeze(1)}, 1);
    std::cout << "xydepth Tensor:\n" << xydepth.index({Slice(0, 10)}) << std::endl;

    return std::make_tuple(xydepth, colors, valid_indices);

}

This code also creates the ndc tensor in a similar way as draw_points in your kernel(which I changed for checking if uniform.transform of mine is equivalent to yours):

void drawPoint(Point point, Node* node, uint64_t* framebuffer, bool* deviceDebug, MyData* DeviceNdc, MyData* DevicePointFloat4, MyMat4* DeviceTransform){
    float4 ndc = uniforms.transform * float4{point.x, point.y, point.z, 1.0f};
    float depth = ndc.w;
    ndc = ndc / ndc.w;

    int x = (ndc.x * 0.5 + 0.5) * uniforms.width;
    int y = (ndc.y * 0.5 + 0.5) * uniforms.height;

    if(x > 1 && x < uniforms.width  - 2.0)
    if(y > 1 && y < uniforms.height - 2.0)
    {

        uint32_t color = point.color;
        if(uniforms.colorByNode){
            color = (node->getID() % 127) * 123456789ull;
        }else if(uniforms.colorByLOD){
            color = getLodColor(node->level);
        }

        // float w = y - 3.0 * x;
        // if(w > -3800.0){
        //  color = point.color;
        // }else{
        //  color = (node->getID() % 127) * 123456789ull;
        //  color = getLodColor(node->level);
        // }

        for(int ox = 0; ox < uniforms.pointSize; ox++)
        for(int oy = 0; oy < uniforms.pointSize; oy++)
        {
            uint32_t px = clamp(x + ox, 0, int(uniforms.width));
            uint32_t py = clamp(y + oy, 0, int(uniforms.height));

            uint32_t pixelID = px + int(uniforms.width) * py;
            uint64_t udepth = *((uint32_t*)&depth);
            uint64_t encoded = (udepth << 32) | color;

            if(encoded < framebuffer[pixelID]){
                atomicMin(&framebuffer[pixelID], encoded);
            }
        }

    }

    if (*deviceDebug == true){

        int threadIndex = blockIdx.x * blockDim.x + threadIdx.x;
        if (threadIndex < 10) {
            DeviceNdc[threadIndex].x = ndc.x;
            DeviceNdc[threadIndex].y = ndc.y;
            DeviceNdc[threadIndex].z = ndc.z;
            DeviceNdc[threadIndex].w = ndc.w;
            //cudaMemcpy(&hostndc[threadIndex], &ndc, sizeof(MyData), cudaMemcpyDeviceToHost);
            //cudaMemcpy(&hosttransform[threadIndex], &uniforms.transform, sizeof(MyMat4), cudaMemcpyDeviceToHost);
            // zeroth row
            DeviceTransform[threadIndex].rows[0].x = uniforms.transform.rows[0].x;
            DeviceTransform[threadIndex].rows[0].y = uniforms.transform.rows[0].y;
            DeviceTransform[threadIndex].rows[0].z = uniforms.transform.rows[0].z;
            DeviceTransform[threadIndex].rows[0].w = uniforms.transform.rows[0].w;
            // first row
            DeviceTransform[threadIndex].rows[1].x = uniforms.transform.rows[1].x;
            DeviceTransform[threadIndex].rows[1].y = uniforms.transform.rows[1].y;
            DeviceTransform[threadIndex].rows[1].z = uniforms.transform.rows[1].z;
            DeviceTransform[threadIndex].rows[1].w = uniforms.transform.rows[1].w;
            // second row
            DeviceTransform[threadIndex].rows[2].x = uniforms.transform.rows[2].x;
            DeviceTransform[threadIndex].rows[2].y = uniforms.transform.rows[2].y;
            DeviceTransform[threadIndex].rows[2].z = uniforms.transform.rows[2].z;
            DeviceTransform[threadIndex].rows[2].w = uniforms.transform.rows[2].w;
            // third row
            DeviceTransform[threadIndex].rows[3].x = uniforms.transform.rows[3].x;
            DeviceTransform[threadIndex].rows[3].y = uniforms.transform.rows[3].y;
            DeviceTransform[threadIndex].rows[3].z = uniforms.transform.rows[3].z;
            DeviceTransform[threadIndex].rows[3].w = uniforms.transform.rows[3].w;

            DevicePointFloat4[threadIndex].x = point.x;
            DevicePointFloat4[threadIndex].y = point.y;
            DevicePointFloat4[threadIndex].z = point.z;
            DevicePointFloat4[threadIndex].w = 1.0f;

            // MyData pointFloat4 = {point.x, point.y, point.z, 1.0f};
            // cudaMemcpy(&hostpointFloat4[threadIndex], &pointFloat4, sizeof(MyData), cudaMemcpyDeviceToHost);
        }

    }
}

However I get: afbeelding But your SimLOD gives: captured_image_05

Is there some transform that is being done on the points that I am missing? I really like the fact that your software is so fast and is self contained, to the extent that is possible of course. So I'd really like to use it, but this really stumped me. And since your the developer I thought maybe you know what I am doing wrong.

m-schuetz / SimLOD

Trying to use the pose and image for ML #3