fabio-sim / LightGlue-ONNX

ONNX-compatible LightGlue: Local Feature Matching at Light Speed. Supports TensorRT, OpenVINO
Apache License 2.0
376 stars 34 forks source link

Lightglue produces wrong matches after TensorRT optimization #90

Closed PeppeFacoltativo closed 1 month ago

PeppeFacoltativo commented 2 months ago

Hello, thank you for this very useful repo!

I am facing issues with Lightglue TensorRT optimization: I converted superpoint_lightglue.trt.onnx to the TensorRT model using the command

trtexec --onnx=superpoint_lightglue_trt.onnx --saveEngine=superpoint_lightglue.trt --shapes=kpts0:1x1024x2,kpts1:1x1024x2,desc0:1x1024x256,desc1:1x1024x256

Then I used the produced model superpoint_lightglue.trt to perform inference and something weird happened: if I feed LightGlue the same keypoints and descriptors for both images, the results seem good (despite there being 946 matches instead of the expected 1024)

immagine

In contrast, if I use it on two different images, the matches are completely different from those obtained with the corresponding ONNX model (superpoint_lightglue.trt.onnx):

immagine

I leave here the code that I used to load the engine and compute the matches:

// Loads a serialized TensorRT engine from disk, creates the execution context and
// a CUDA stream, fixes the four input shapes to `num_keypoints` keypoints per image
// and allocates the device buffers used by infer().
//
// Parameters:
//   engine_file_path - path to the serialized engine file.
//   num_keypoints    - number of keypoints per image the shapes and buffers are sized for.
//
// Throws std::runtime_error if the file cannot be read or if any TensorRT / CUDA
// initialization step fails.
LightglueTensorRT::LightglueTensorRT(const std::string& engine_file_path, size_t num_keypoints)
    : num_keypoints(num_keypoints)
{
    std::cout << "Loading engine: " << engine_file_path << "..." << std::endl;

    // Load TensorRT engine from file
    std::ifstream engineFile(engine_file_path, std::ios::binary);
    if (!engineFile)
    {
        std::cerr << "Error opening engine file: " << engine_file_path << std::endl;
        throw std::runtime_error("Failed to load engine");
    }

    engineFile.seekg(0, engineFile.end);
    // tellg() reports -1 on failure; check before using the value as a buffer size.
    const std::streamoff file_size = engineFile.tellg();
    if (file_size <= 0)
    {
        std::cerr << "Engine file is empty or unreadable: " << engine_file_path << std::endl;
        throw std::runtime_error("Failed to load engine");
    }
    engineFile.seekg(0, engineFile.beg);

    const size_t engine_size = static_cast<size_t>(file_size);
    std::vector<char> engine_data(engine_size);
    engineFile.read(engine_data.data(), static_cast<std::streamsize>(engine_size));
    if (!engineFile)
    {
        std::cerr << "Error reading engine file: " << engine_file_path << std::endl;
        throw std::runtime_error("Failed to load engine");
    }
    engineFile.close();

    // The runtime keeps a reference to the logger, so the logger must outlive the
    // runtime/engine/context. A stack local would dangle as soon as this constructor
    // returns; a function-local static lives until program exit.
    static LoggerLG logger;
    runtime = nvinfer1::createInferRuntime(logger);
    if (!runtime)
    {
        throw std::runtime_error("Failed to create TensorRT runtime");
    }

    engine = runtime->deserializeCudaEngine(engine_data.data(), engine_size);
    if (!engine)
    {
        throw std::runtime_error("Failed to deserialize TensorRT engine");
    }

    context = engine->createExecutionContext();
    if (!context)
    {
        throw std::runtime_error("Failed to create TensorRT execution context");
    }

    // Create CUDA stream
    const cudaError_t stream_status = cudaStreamCreate(&stream);
    if (stream_status != cudaSuccess)
    {
        throw std::runtime_error(std::string("Failed to create CUDA stream: ") +
                                 cudaGetErrorString(stream_status));
    }

    // Set tensor shapes (dynamic input). setInputShape() returns false when the
    // requested shape is rejected, e.g. outside the engine's optimization profile.
    const int n = static_cast<int>(num_keypoints);
    const bool shapes_ok =
        context->setInputShape("kpts0", nvinfer1::Dims3(1, n, 2)) &&
        context->setInputShape("kpts1", nvinfer1::Dims3(1, n, 2)) &&
        context->setInputShape("desc0", nvinfer1::Dims3(1, n, 256)) &&
        context->setInputShape("desc1", nvinfer1::Dims3(1, n, 256));
    if (!shapes_ok)
    {
        throw std::runtime_error("Failed to set input shapes for the requested number of keypoints");
    }

    // Memory sizes
    kps_nbytes = num_keypoints * 2 * sizeof(float);
    desc_nbytes = num_keypoints * 256 * sizeof(float);
    matches_size = num_keypoints * 2 * sizeof(int64_t);
    scores_size = num_keypoints * sizeof(float);

    // Allocate input and output memory
    input_memory.push_back(safeCudaMalloc(kps_nbytes));  // keypoints0
    input_memory.push_back(safeCudaMalloc(kps_nbytes));  // keypoints1
    input_memory.push_back(safeCudaMalloc(desc_nbytes)); // descriptors0
    input_memory.push_back(safeCudaMalloc(desc_nbytes)); // descriptors1

    output_memory.push_back(safeCudaMalloc(matches_size)); // matches
    output_memory.push_back(safeCudaMalloc(scores_size));  // scores
}

// Runs the matcher on one pair of keypoint/descriptor sets and returns the matches
// whose score is above the (currently zero) threshold.
//
// Parameters:
//   keypoints0 / keypoints1     - exactly `num_keypoints` points each.
//   descriptors0 / descriptors1 - flat float arrays whose byte size equals the
//                                 descriptor buffers allocated in the constructor.
//
// Keypoints are uploaded exactly as given; any coordinate normalization expected by
// the model has to be applied by the caller before calling this function.
//
// Throws std::runtime_error when the input sizes do not match the allocated buffers
// or when a CUDA / TensorRT call fails.
MatchOutput LightglueTensorRT::infer(const std::vector<cv::Point2f>& keypoints0, const std::vector<cv::Point2f>& keypoints1,
    const std::vector<float>& descriptors0, const std::vector<float>& descriptors1)
{
    const float match_score_thresh = 0.0f;

    // Turns a failed CUDA runtime call into an exception instead of silently continuing.
    auto cudaCheck = [](cudaError_t status, const char* what)
    {
        if (status != cudaSuccess)
        {
            throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
        }
    };

    // The device buffers and input shapes are fixed at construction time. Larger
    // inputs would overflow the buffers, smaller ones would leave stale data from a
    // previous call in the tail of the tensors.
    if (keypoints0.size() * sizeof(cv::Point2f) != kps_nbytes ||
        keypoints1.size() * sizeof(cv::Point2f) != kps_nbytes)
    {
        throw std::runtime_error("Keypoint count does not match the engine input size");
    }
    if (descriptors0.size() * sizeof(float) != desc_nbytes ||
        descriptors1.size() * sizeof(float) != desc_nbytes)
    {
        throw std::runtime_error("Descriptor size does not match the engine input size");
    }

    // Preprocessing: copy input data to GPU
    cudaCheck(cudaMemcpyAsync(input_memory[0], keypoints0.data(), kps_nbytes, cudaMemcpyHostToDevice, stream),
              "Failed to upload keypoints0");
    cudaCheck(cudaMemcpyAsync(input_memory[1], keypoints1.data(), kps_nbytes, cudaMemcpyHostToDevice, stream),
              "Failed to upload keypoints1");
    cudaCheck(cudaMemcpyAsync(input_memory[2], descriptors0.data(), desc_nbytes, cudaMemcpyHostToDevice, stream),
              "Failed to upload descriptors0");
    cudaCheck(cudaMemcpyAsync(input_memory[3], descriptors1.data(), desc_nbytes, cudaMemcpyHostToDevice, stream),
              "Failed to upload descriptors1");

    // Clear the outputs so rows the engine does not write keep a zero score and are
    // dropped by the threshold below rather than carrying results of an earlier call.
    // NOTE(review): assumes the engine may write fewer than num_keypoints rows
    // (data-dependent match count) - confirm against the model's output shapes.
    cudaCheck(cudaMemsetAsync(output_memory[0], 0, matches_size, stream), "Failed to clear matches buffer");
    cudaCheck(cudaMemsetAsync(output_memory[1], 0, scores_size, stream), "Failed to clear scores buffer");

    // Bind input and output tensor addresses
    const bool bound =
        context->setTensorAddress("kpts0", input_memory[0]) &&
        context->setTensorAddress("kpts1", input_memory[1]) &&
        context->setTensorAddress("desc0", input_memory[2]) &&
        context->setTensorAddress("desc1", input_memory[3]) &&
        context->setTensorAddress("matches0", output_memory[0]) &&
        context->setTensorAddress("mscores0", output_memory[1]);
    if (!bound)
    {
        throw std::runtime_error("Failed to bind tensor addresses");
    }

    // Run inference. Copies, inference and read-back are all issued on the same
    // stream, so they execute in order without intermediate synchronization.
    if (!context->enqueueV3(stream))
    {
        std::cerr << "Failed to run inference with enqueueV3" << std::endl;
        throw std::runtime_error("Inference failed");
    }

    // Copy the output data back to CPU
    std::vector<int64_t> matches(num_keypoints * 2);
    std::vector<float> scores(num_keypoints);

    cudaCheck(cudaMemcpyAsync(matches.data(), output_memory[0], matches.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, stream),
              "Failed to download matches");
    cudaCheck(cudaMemcpyAsync(scores.data(), output_memory[1], scores.size() * sizeof(float), cudaMemcpyDeviceToHost, stream),
              "Failed to download scores");

    // Single synchronization point: also surfaces asynchronous execution errors.
    cudaCheck(cudaStreamSynchronize(stream), "Inference stream failed");

    // Postprocessing: filter out matches based on the threshold
    MatchOutput result;
    for (size_t i = 0; i < num_keypoints; ++i)
    {
        if (scores[i] > match_score_thresh)
        {
            result.matches.emplace_back(matches[i * 2], matches[i * 2 + 1]);
            result.scores.push_back(scores[i]);
        }
    }

    return result;
}

I leave here some details about my environment:

fabio-sim commented 2 months ago

Hi @PeppeFacoltativo, thank you for your interest in LightGlue-ONNX.

I see that you're using the older v1 models. Sorry I'm not an expert in the TensorRT C++ API, but I've added a working sample that uses the Python API (via Polygraphy) for pure TensorRT inference with the v2 LightGlue pipelines:

https://github.com/fabio-sim/LightGlue-ONNX/blob/a129c29c077112c03f1968f07a543952ece31251/dynamo.py#L201-L279

PeppeFacoltativo commented 2 months ago

Thank you for your answer, I will certainly try the v2 models. Anyway, I was able to solve the issue: I was missing a preprocessing step (keypoint normalization) before running inference with LightGlue. I am posting the fixed code here for reference.

    // Normalizes the keypoints of both images, runs the matcher and returns the
    // matches whose score is above the (currently zero) threshold.
    //
    // Each keypoint is shifted by half the image size and divided by half of the
    // larger image dimension before being uploaded.
    //
    // Parameters:
    //   keypoints0 / keypoints1     - pixel coordinates, exactly `num_keypoints` each.
    //   descriptors0 / descriptors1 - flat float arrays whose byte size equals the
    //                                 descriptor buffers (`desc_nbytes`).
    //   img_width / img_height      - image size used for normalization; the same
    //                                 size is applied to both keypoint sets.
    //
    // Throws std::runtime_error on mismatched input sizes, a zero image size, or a
    // failed CUDA / TensorRT call.
    MatchOutput infer(const std::vector<cv::Point2f>& keypoints0, const std::vector<cv::Point2f>& keypoints1,
        const std::vector<float>& descriptors0, const std::vector<float>& descriptors1, const size_t& img_width, const size_t& img_height)
    {
        const float match_score_thresh = 0.0f;

        // Turns a failed CUDA runtime call into an exception instead of silently continuing.
        auto cudaCheck = [](cudaError_t status, const char* what)
        {
            if (status != cudaSuccess)
            {
                throw std::runtime_error(std::string(what) + ": " + cudaGetErrorString(status));
            }
        };

        // Buffers and input shapes are sized for exactly num_keypoints entries.
        if (keypoints0.size() != num_keypoints || keypoints1.size() != num_keypoints)
        {
            throw std::runtime_error("Keypoint count does not match the engine input size");
        }
        if (descriptors0.size() * sizeof(float) != desc_nbytes ||
            descriptors1.size() * sizeof(float) != desc_nbytes)
        {
            throw std::runtime_error("Descriptor size does not match the engine input size");
        }
        // A zero-sized image would make the scale zero and the division below produce inf/NaN.
        if (img_width == 0 || img_height == 0)
        {
            throw std::runtime_error("Image size must be non-zero");
        }

        // Calculate shift and scale
        const float width = static_cast<float>(img_width);
        const float height = static_cast<float>(img_height);
        const float shift_x = width / 2.0f;
        const float shift_y = height / 2.0f;
        const float scale = std::max(width, height) / 2.0f;

        // Normalize each keypoint into interleaved (x, y) float pairs
        std::vector<float> keypoints0_in(2 * num_keypoints);
        std::vector<float> keypoints1_in(2 * num_keypoints);
        for (size_t i = 0; i < num_keypoints; i++)
        {
            keypoints0_in[2 * i] = (keypoints0[i].x - shift_x) / scale;
            keypoints0_in[2 * i + 1] = (keypoints0[i].y - shift_y) / scale;

            keypoints1_in[2 * i] = (keypoints1[i].x - shift_x) / scale;
            keypoints1_in[2 * i + 1] = (keypoints1[i].y - shift_y) / scale;
        }

        // Preprocessing: copy input data to GPU
        cudaCheck(cudaMemcpyAsync(input_memory[0], keypoints0_in.data(), keypoints0_in.size() * sizeof(float), cudaMemcpyHostToDevice, stream),
                  "Failed to upload keypoints0");
        cudaCheck(cudaMemcpyAsync(input_memory[1], keypoints1_in.data(), keypoints1_in.size() * sizeof(float), cudaMemcpyHostToDevice, stream),
                  "Failed to upload keypoints1");
        cudaCheck(cudaMemcpyAsync(input_memory[2], descriptors0.data(), desc_nbytes, cudaMemcpyHostToDevice, stream),
                  "Failed to upload descriptors0");
        cudaCheck(cudaMemcpyAsync(input_memory[3], descriptors1.data(), desc_nbytes, cudaMemcpyHostToDevice, stream),
                  "Failed to upload descriptors1");

        // Bind input and output tensor addresses
        const bool bound =
            context->setTensorAddress("kpts0", input_memory[0]) &&
            context->setTensorAddress("kpts1", input_memory[1]) &&
            context->setTensorAddress("desc0", input_memory[2]) &&
            context->setTensorAddress("desc1", input_memory[3]) &&
            context->setTensorAddress("matches0", output_memory[0]) &&
            context->setTensorAddress("mscores0", output_memory[1]);
        if (!bound)
        {
            throw std::runtime_error("Failed to bind tensor addresses");
        }

        // Make sure the uploads have finished while the staging vectors are alive
        cudaCheck(cudaStreamSynchronize(stream), "Failed to upload inputs");

        // Run inference
        if (!context->enqueueV3(stream))
        {
            std::cerr << "Failed to run inference with enqueueV3" << std::endl;
            throw std::runtime_error("Inference failed");
        }

        // Copy the output data back to CPU. Issued on the same stream as the
        // inference, so the copies start only after it has finished.
        std::vector<int64_t> matches(num_keypoints * 2);
        std::vector<float> scores(num_keypoints);

        cudaCheck(cudaMemcpyAsync(matches.data(), output_memory[0], matches.size() * sizeof(int64_t), cudaMemcpyDeviceToHost, stream),
                  "Failed to download matches");
        cudaCheck(cudaMemcpyAsync(scores.data(), output_memory[1], scores.size() * sizeof(float), cudaMemcpyDeviceToHost, stream),
                  "Failed to download scores");
        cudaCheck(cudaStreamSynchronize(stream), "Inference stream failed");

        // Postprocessing: keep only matches above the threshold. The function is
        // declared to return MatchOutput, so every path must end in a return.
        MatchOutput result;
        for (size_t i = 0; i < num_keypoints; ++i)
        {
            if (scores[i] > match_score_thresh)
            {
                result.matches.emplace_back(matches[i * 2], matches[i * 2 + 1]);
                result.scores.push_back(scores[i]);
            }
        }

        return result;
    }