xuanandsix / CLRNet-onnxruntime-and-tensorrt-demo

This is the onnxruntime and tensorrt inference code for CLRNet: Cross Layer Refinement Network for Lane Detection (CVPR 2022). Official code: https://github.com/hongyliu/CLRNet
54 stars 6 forks source link

c++ inference #15

Open interstellar-space opened 3 months ago

interstellar-space commented 3 months ago

I wrote some C++ code, but the inference results are different from Python. Can you help me take a look?

#include <iostream>
#include <fstream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <NvInfer.h>
#include <cuda_runtime_api.h>
#include <NvInferRuntimeCommon.h>
#include <algorithm>
#include <cmath>
#include <numeric>
#include <Eigen/Dense>
#include <unsupported/Eigen/Splines>

using namespace nvinfer1;

const std::vector<cv::Scalar> COLORS = {
    cv::Scalar(255, 0, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255),
    cv::Scalar(255, 255, 0), cv::Scalar(255, 0, 255), cv::Scalar(0, 255, 255),
    cv::Scalar(128, 255, 0), cv::Scalar(255, 128, 0), cv::Scalar(128, 0, 255),
    cv::Scalar(255, 0, 128), cv::Scalar(0, 128, 255), cv::Scalar(0, 255, 128),
    cv::Scalar(128, 255, 255), cv::Scalar(255, 128, 255), cv::Scalar(255, 255, 128),
    cv::Scalar(60, 180, 0), cv::Scalar(180, 60, 0), cv::Scalar(0, 60, 180),
    cv::Scalar(0, 180, 60), cv::Scalar(60, 0, 180), cv::Scalar(180, 0, 60)};

class Lane
{
public:
    Lane(const std::vector<cv::Point2f> &points, float invalid_value = -2.0f)
        : points(points), invalid_value(invalid_value)
    {
        // Initialize spline interpolation using Eigen
        Eigen::VectorXd x(points.size()), y(points.size());
        for (size_t i = 0; i < points.size(); ++i)
        {
            x[i] = points[i].y;
            y[i] = points[i].x;
        }
        spline = Eigen::SplineFitting<Eigen::Spline<double, 1>>::Interpolate(y.transpose(), std::min<int>(3, points.size() - 1), x);
        min_y = x.minCoeff() - 0.01;
        max_y = x.maxCoeff() + 0.01;
    }

    std::vector<cv::Point2f> to_array() const
    {
        std::vector<cv::Point2f> lane;
        for (int y = 710; y >= 150; y -= 10)
        {
            double x = spline(y)(0);
            if (x >= 0 && x < 1)
            {
                lane.emplace_back(x * 1280, y);
            }
        }
        return lane;
    }

private:
    std::vector<cv::Point2f> points;
    float invalid_value;
    Eigen::Spline<double, 1> spline;
    double min_y, max_y;
};

class Logger : public nvinfer1::ILogger
{
    void log(Severity severity, const char *msg) noexcept override
    {
        if (severity <= Severity::kINFO)
        {
            std::cerr << msg << std::endl;
        }
    }
};

class CLRNetDemo
{
public:
    CLRNetDemo(const std::string &engine_path)
    {
        // Load TensorRT engine
        std::ifstream engine_file(engine_path, std::ios::binary);
        std::vector<char> engine_data((std::istreambuf_iterator<char>(engine_file)), std::istreambuf_iterator<char>());
        runtime = createInferRuntime(logger);
        engine = runtime->deserializeCudaEngine(engine_data.data(), engine_data.size());
        context = engine->createExecutionContext();

        // Initialize input and output bindings
        for (int i = 0; i < engine->getNbBindings(); ++i)
        {
            if (engine->bindingIsInput(i))
            {
                input_binding = i;
            }
            else
            {
                output_binding = i;
            }
        }

        // Allocate memory for input and output
        auto input_dims = engine->getBindingDimensions(input_binding);
        auto output_dims = engine->getBindingDimensions(output_binding);
        size_t input_size = 1;
        size_t output_size = 1;
        for (int i = 0; i < input_dims.nbDims; ++i)
        {
            input_size *= input_dims.d[i];
        }
        for (int i = 0; i < output_dims.nbDims; ++i)
        {
            output_size *= output_dims.d[i];
        }
        cudaMalloc(&buffers[input_binding], input_size * sizeof(float));
        cudaMalloc(&buffers[output_binding], output_size * sizeof(float));
        cudaStreamCreate(&stream);
    }

    ~CLRNetDemo()
    {
        cudaFree(buffers[input_binding]);
        cudaFree(buffers[output_binding]);
        cudaStreamDestroy(stream);
        context->destroy();
        engine->destroy();
        runtime->destroy();
    }

    cv::Mat forward(const cv::Mat &img)
    {
        // Preprocess input image
        cv::Mat input_img = img(cv::Rect(0, 160, img.cols, img.rows - 160));
        cv::resize(input_img, input_img, cv::Size(800, 320), cv::INTER_CUBIC);
        input_img.convertTo(input_img, CV_32FC3, 1.0 / 255.0);

        // Transpose the image to match the model input
        cv::Mat input_img_transposed;
        cv::dnn::blobFromImage(input_img, input_img_transposed);

        // Allocate memory for input and output
        std::vector<float> input_data(input_img_transposed.total() * input_img_transposed.channels());
        std::memcpy(input_data.data(), input_img_transposed.data, input_data.size() * sizeof(float));

        auto output_dims = engine->getBindingDimensions(output_binding);
        size_t output_size = 1;
        for (int i = 0; i < output_dims.nbDims; ++i)
        {
            output_size *= output_dims.d[i];
        }
        std::vector<float> output_data(output_size);

        // Execute inference
        cudaMemcpyAsync(buffers[input_binding], input_data.data(), input_data.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
        context->enqueueV2(buffers, stream, nullptr);
        cudaMemcpyAsync(output_data.data(), buffers[output_binding], output_data.size() * sizeof(float), cudaMemcpyDeviceToHost, stream);
        cudaStreamSynchronize(stream);

        // Postprocess output
        auto lanes = get_lanes(output_data);
        return imshow_lanes(img, lanes);
    }

private:
    IRuntime *runtime;
    ICudaEngine *engine;
    IExecutionContext *context;
    int input_binding, output_binding;
    void *buffers[2];
    cudaStream_t stream;
    Logger logger;

    std::vector<Lane> get_lanes(const std::vector<float> &output)
    {
        std::vector<Lane> decoded;
        std::vector<std::vector<float>> predictions(output.size() / 78, std::vector<float>(78));

        for (size_t i = 0; i < predictions.size(); ++i)
        {
            std::copy(output.begin() + i * 78, output.begin() + (i + 1) * 78, predictions[i].begin());
        }

        for (auto &prediction : predictions)
        {
            std::vector<float> scores = softmax({prediction[0], prediction[1]});
            std::cout << "scores: " << scores[0] << ", " << scores[1] << "\n";
            if (scores[1] < 0.4)
            {
                continue;
            }

            std::vector<std::vector<float>> nms_predictions;
            for (size_t i = 0; i < prediction.size(); ++i)
            {
                if (i < 4 || i >= 5)
                {
                    nms_predictions.push_back(prediction);
                }
            }
            std::cout << "nms_predictions: " << nms_predictions.size() << "\n";

            for (auto &nms_prediction : nms_predictions)
            {
                nms_prediction[4] *= 71;
                for (size_t i = 5; i < nms_prediction.size(); ++i)
                {
                    nms_prediction[i] *= 1279;
                }
            }

            auto keep = Lane_nms(nms_predictions, scores, 50, 5);
            std::vector<std::vector<float>> filtered_predictions;
            for (auto idx : keep)
            {
                filtered_predictions.push_back(predictions[idx]);
            }

            for (auto &filtered_prediction : filtered_predictions)
            {
                filtered_prediction[5] = std::round(filtered_prediction[5] * 71);
            }
            std::cout << "filtered_predictions: " << filtered_predictions.size() << "\n";

            auto pred = predictions_to_pred(filtered_predictions);
            decoded.insert(decoded.end(), pred.begin(), pred.end());
        }
        return decoded;
    }

    cv::Mat imshow_lanes(const cv::Mat &img, const std::vector<Lane> &lanes)
    {
        cv::Mat output_img = img.clone();
        for (size_t i = 0; i < lanes.size(); ++i)
        {
            auto lane_points = lanes[i].to_array();
            for (const auto &point : lane_points)
            {
                if (point.x > 0 && point.y > 0)
                {
                    cv::circle(output_img, point, 5, COLORS[i % COLORS.size()], -1);
                }
            }

            for (size_t j = 1; j < lane_points.size(); ++j)
            {
                if (lane_points[j - 1].x > 0 && lane_points[j - 1].y > 0 && lane_points[j].x > 0 && lane_points[j].y > 0)
                {
                    cv::line(output_img, lane_points[j - 1], lane_points[j], COLORS[i % COLORS.size()], 4);
                }
            }
        }
        return output_img;
    }

    std::vector<float> softmax(const std::vector<float> &x)
    {
        std::vector<float> y(x.size());
        float max_val = *std::max_element(x.begin(), x.end());
        float sum = 0.0f;
        for (size_t i = 0; i < x.size(); ++i)
        {
            y[i] = std::exp(x[i] - max_val);
            sum += y[i];
        }
        for (size_t i = 0; i < x.size(); ++i)
        {
            y[i] /= sum;
        }
        return y;
    }

    bool Lane_IOU(const std::vector<float> &parent_box, const std::vector<float> &compared_box, float threshold)
    {
        int n_offsets = 72;
        int n_strips = n_offsets - 1;

        int start_a = static_cast<int>(parent_box[2] * n_strips + 0.5);
        int start_b = static_cast<int>(compared_box[2] * n_strips + 0.5);
        int start = std::max(start_a, start_b);
        int end_a = start_a + static_cast<int>(parent_box[4] - 1 + 0.5 - ((parent_box[4] - 1) < 0));
        int end_b = start_b + static_cast<int>(compared_box[4] - 1 + 0.5 - ((compared_box[4] - 1) < 0));
        int end = std::min({end_a, end_b, 71});
        if ((end - start) < 0)
        {
            return false;
        }
        float dist = 0.0f;
        for (int i = 5 + start; i <= 5 + end; ++i)
        {
            if (parent_box[i] < compared_box[i])
            {
                dist += compared_box[i] - parent_box[i];
            }
            else
            {
                dist += parent_box[i] - compared_box[i];
            }
        }
        return dist < (threshold * (end - start + 1));
    }

    std::vector<int> Lane_nms(const std::vector<std::vector<float>> &proposals, const std::vector<float> &scores, float overlap, int top_k)
    {
        std::vector<int> keep_index;
        std::vector<int> indices(scores.size());
        std::iota(indices.begin(), indices.end(), 0);
        std::sort(indices.begin(), indices.end(), [&scores](int a, int b)
                  { return scores[a] > scores[b]; });

        std::vector<int> r_filters(scores.size(), 0);

        for (size_t i = 0; i < indices.size(); ++i)
        {
            if (r_filters[i] == 1)
            {
                continue;
            }
            keep_index.push_back(indices[i]);
            if (keep_index.size() > static_cast<size_t>(top_k))
            {
                break;
            }
            if (i == indices.size() - 1)
            {
                break;
            }
            for (size_t j = i + 1; j < indices.size(); ++j)
            {
                if (Lane_IOU(proposals[indices[i]], proposals[indices[j]], overlap))
                {
                    r_filters[j] = 1;
                }
            }
        }
        return keep_index;
    }

    std::vector<Lane> predictions_to_pred(const std::vector<std::vector<float>> &predictions)
    {
        std::vector<Lane> lanes;
        for (const auto &lane : predictions)
        {
            std::vector<float> lane_xs(lane.begin() + 6, lane.end());
            int start = std::min(std::max(0, static_cast<int>(std::round(lane[2] * 71))), 71);
            int length = static_cast<int>(std::round(lane[5]));
            int end = start + length - 1;
            end = std::min(end, 71);

            std::vector<bool> mask(start, false);
            for (int i = 0; i < start; ++i)
            {
                if (lane_xs[i] >= 0 && lane_xs[i] <= 1)
                {
                    mask[i] = true;
                }
            }

            for (int i = 0; i < start; ++i)
            {
                if (!mask[i])
                {
                    lane_xs[i] = -2;
                }
            }

            for (int i = end + 1; i < lane_xs.size(); ++i)
            {
                lane_xs[i] = -2;
            }

            std::vector<float> lane_ys;
            for (int i = 0; i < lane_xs.size(); ++i)
            {
                if (lane_xs[i] >= 0)
                {
                    lane_ys.push_back(1.0f - static_cast<float>(i) / 71.0f);
                }
            }

            std::vector<cv::Point2f> points;
            for (int i = 0; i < lane_xs.size(); ++i)
            {
                if (lane_xs[i] >= 0)
                {
                    points.emplace_back(lane_xs[i] * 1280, lane_ys[i] * (720 - 160) + 160);
                }
            }

            if (points.size() > 1)
            {
                lanes.emplace_back(points);
            }
        }
        return lanes;
    }
};

int main(int argc, char *argv[])
{
    if (argc != 3)
    {
        std::cout << argv[0] << ": <engine> <image>" << std::endl;
        return 0;
    }

    CLRNetDemo isnet(argv[1]);
    cv::Mat image = cv::imread(argv[2]);
    if (image.empty())
    {
        std::cerr << "Error: Could not open or find the image!" << std::endl;
        return -1;
    }
    cv::Mat output = isnet.forward(image);
    cv::imwrite("output_trt.png", output);
    return 0;
}
xjock commented 3 months ago

solved?

interstellar-space commented 3 months ago

@xjock no... Here is my code, but it cannot draw any lane lines

#include <iostream>
#include <fstream>
#include <NvInfer.h>
#include <memory>
#include <NvOnnxParser.h>
#include <vector>
#include <cuda_runtime_api.h>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>
#include <algorithm>
#include <numeric>

struct Detection
{
    float background;
    float foreground;
    float start_y;
    float start_x;
    float theta;
    float length;
    float lane_x_coordinates[72];
};

// utilities ----------------------------------------------------------------------------------------------------------
// class to log errors, warnings, and other information during the build and inference phases
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char *msg) noexcept override
    {
        // remove this 'if' if you need more logged info
        if ((severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR))
        {
            std::cout << msg << "\n";
        }
    }
} gLogger;

// destroy TensorRT objects if something goes wrong
struct TRTDestroy
{
    template <class T>
    void operator()(T *obj) const
    {
        if (obj)
        {
            obj->destroy();
        }
    }
};

template <class T>
using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;

// calculate size of tensor
size_t getSizeByDim(const nvinfer1::Dims &dims)
{
    size_t size = 1;
    for (size_t i = 0; i < dims.nbDims; ++i)
    {
        size *= dims.d[i];
    }
    return size;
}

// preprocessing stage ------------------------------------------------------------------------------------------------
void preprocessImage(cv::cuda::GpuMat &gpu_frame, float *gpu_input, const nvinfer1::Dims &dims)
{
    auto input_width = dims.d[2];
    auto input_height = dims.d[1];
    auto channels = dims.d[0];
    auto input_size = cv::Size(input_width, input_height);
    // resize
    cv::cuda::GpuMat resized;
    cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_NEAREST);
    // normalize
    cv::cuda::GpuMat flt_image;
    resized.convertTo(flt_image, CV_32FC3, 1.f / 255.f);
    cv::cuda::subtract(flt_image, cv::Scalar(0.485f, 0.456f, 0.406f), flt_image, cv::noArray(), -1);
    cv::cuda::divide(flt_image, cv::Scalar(0.229f, 0.224f, 0.225f), flt_image, 1, -1);
    // to tensor
    std::vector<cv::cuda::GpuMat> chw;
    for (size_t i = 0; i < channels; ++i)
    {
        chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
    }
    cv::cuda::split(flt_image, chw);
}

std::vector<std::vector<float>> softmax(const std::vector<std::vector<float>> &x)
{
    std::vector<std::vector<float>> y(x.size(), std::vector<float>(x[0].size()));

    // Assume that the softmax is performed along the last axis (columns).
    for (size_t i = 0; i < x.size(); ++i)
    {
        float maxVal = *std::max_element(x[i].begin(), x[i].end());

        std::vector<float> expVec(x[i].size());
        float sum = 0.0f;
        for (size_t j = 0; j < x[i].size(); ++j)
        {
            expVec[j] = exp(x[i][j] - maxVal);
            sum += expVec[j];
        }

        for (size_t j = 0; j < y[i].size(); ++j)
        {
            y[i][j] = expVec[j] / sum;
        }
    }

    return y;
}

bool Lane_IOU(const std::vector<float> &parent_box, const std::vector<float> &compared_box, float threshold)
{
    int n_offsets = 72;
    int n_strips = n_offsets - 1;

    int start_a = static_cast<int>(parent_box[2] * n_strips + 0.5);
    int start_b = static_cast<int>(compared_box[2] * n_strips + 0.5);
    int start = std::max(start_a, start_b);
    int end_a = start_a + static_cast<int>(parent_box[4] - 1 + 0.5 - ((parent_box[4] - 1) < 0));
    int end_b = start_b + static_cast<int>(compared_box[4] - 1 + 0.5 - ((compared_box[4] - 1) < 0));
    int end = std::min({end_a, end_b, 71});
    if ((end - start) < 0)
    {
        return false;
    }
    float dist = 0.0f;
    for (int i = 5 + start; i <= 5 + end; ++i)
    {
        if (parent_box[i] < compared_box[i])
        {
            dist += compared_box[i] - parent_box[i];
        }
        else
        {
            dist += parent_box[i] - compared_box[i];
        }
    }
    return dist < (threshold * (end - start + 1));
}

std::vector<int> Lane_nms(const std::vector<std::vector<float>> &proposals, const std::vector<float> &scores, float overlap = 50, int top_k = 4)
{
    std::vector<int> keep_index;
    std::vector<size_t> indices(scores.size());
    std::iota(indices.begin(), indices.end(), 0);

    // Sort indices based on corresponding scores in descending order
    std::sort(indices.begin(), indices.end(), [&scores](int a, int b)
              { return scores[a] > scores[b]; });

    std::vector<int> r_filters(scores.size(), 0);

    for (size_t i = 0; i < indices.size(); ++i)
    {
        size_t index = indices[i];
        if (r_filters[index] == 1) // Ensure we check r_filters for the right index
        {
            continue;
        }

        keep_index.push_back(index);

        if (static_cast<int>(keep_index.size()) > top_k) // We cast size to int to compare with top_k
        {
            break;
        }

        if (i == indices.size() - 1) // If it's the last index, break out of the loop
        {
            break;
        }

        // Iterate over the rest of the proposals from this point on
        for (size_t j = i + 1; j < indices.size(); ++j)
        {
            size_t sub_index = indices[j];
            if (!r_filters[sub_index]) // Check if not already filtered
            {
                if (Lane_IOU(proposals[index], proposals[sub_index], overlap))
                {
                    r_filters[sub_index] = 1;
                }
            }
        }
    }

    // Resize to remove any excess elements in case fewer than top_k were kept
    keep_index.resize(std::min(top_k, static_cast<int>(keep_index.size())));

    return keep_index;
}
std::vector<std::vector<cv::Point2f>> predictions_to_pred(const std::vector<std::vector<float>> &predictions,
                                                          const std::vector<float> &prior_ys,
                                                          int n_strips, int ori_img_w, int ori_img_h, int img_w, int img_h, int cut_height)
{
    std::vector<std::vector<cv::Point2f>> lanes;

    for (const auto &lane : predictions)
    {
        std::vector<float> lane_xs(lane.begin() + 6, lane.end()); // normalized value
        int start = std::min(std::max(0, static_cast<int>(round(lane[2] * n_strips))), n_strips);
        int length = static_cast<int>(round(lane[5]));
        int end = start + length - 1;
        end = std::min(end, static_cast<int>(prior_ys.size()) - 1);

        // Extend prediction until x is outside the image
        std::vector<bool> mask(start, false);
        for (int i = start - 1; i >= 0; --i)
        {
            if (lane_xs[i] < 0.0f || lane_xs[i] > 1.0f)
            {
                mask[i] = true;
            }
            else if (i < start - 1 && mask[i + 1])
            {
                mask[i] = true;
            }
        }

        std::fill(lane_xs.begin() + end + 1, lane_xs.end(), -2.0f);
        for (int i = 0; i < start; ++i)
        {
            if (mask[i])
            {
                lane_xs[i] = -2.0f;
            }
        }

        std::vector<float> lane_ys;
        for (size_t i = 0; i < lane_xs.size(); ++i)
        {
            if (lane_xs[i] >= 0)
            {
                lane_ys.push_back(prior_ys[i]);
            }
        }

        lane_xs.erase(std::remove_if(lane_xs.begin(), lane_xs.end(),
                                     [](float x)
                                     { return x < 0; }),
                      lane_xs.end());

        if (lane_xs.size() <= 1)
        {
            continue;
        }

        std::reverse(lane_xs.begin(), lane_xs.end());
        std::reverse(lane_ys.begin(), lane_ys.end());

        auto scale_x = static_cast<float>(ori_img_w) / img_w;
        auto scale_y = static_cast<float>(ori_img_h) / (img_h - cut_height);

        for (size_t i = 0; i < lane_xs.size(); ++i)
        {
            lane_xs[i] = lane_xs[i] * scale_x;
            lane_ys[i] = (lane_ys[i] * (img_h - cut_height) + cut_height) * scale_y;
        }

        std::vector<cv::Point2f> points;
        for (size_t i = 0; i < lane_xs.size(); ++i)
        {
            points.emplace_back(lane_xs[i], lane_ys[i]);
        }

        std::cout << "lane_xs: ";
        for (const auto &x : lane_xs)
        {
            std::cout << x << " ";
        }
        std::cout << "\nlane_ys: ";
        for (const auto &y : lane_ys)
        {
            std::cout << y << " ";
        }
        std::cout << "\n";

        lanes.push_back(points);
    }

    std::cout << "lanes: " << lanes.size() << "\n";

    return lanes;
}

cv::Mat imshow_lanes(cv::Mat &img, const std::vector<std::vector<cv::Point2f>> &lanes, int width = 4)
{
    std::vector<std::vector<cv::Point>> lanes_xys;

    for (const auto &lane : lanes)
    {
        std::vector<cv::Point> xys;
        for (const auto &point : lane)
        {
            if (point.x <= 0.0f || point.y <= 0.0f)
            {
                continue;
            }
            int x = static_cast<int>(point.x);
            int y = static_cast<int>(point.y);
            xys.emplace_back(x, y);
        }
        if (!xys.empty())
        {
            lanes_xys.push_back(xys);
        }
    }
    std::cout << "lanes_xys: " << lanes_xys.size() << "\n";

    std::sort(lanes_xys.begin(), lanes_xys.end(),
              [](const std::vector<cv::Point> &a, const std::vector<cv::Point> &b)
              { return a[0].x < b[0].x; });

    std::vector<cv::Scalar> COLORS = {cv::Scalar(255, 0, 0), cv::Scalar(0, 255, 0), cv::Scalar(0, 0, 255), cv::Scalar(255, 255, 0), cv::Scalar(0, 255, 255)};

    for (size_t idx = 0; idx < lanes_xys.size(); ++idx)
    {
        const auto &xys = lanes_xys[idx];
        for (size_t i = 1; i < xys.size(); ++i)
        {
            cv::line(img, xys[i - 1], xys[i], COLORS[idx % COLORS.size()], width);
        }
    }

    cv::imwrite("test.jpg", img);
    // cv::imshow("Lanes", img);
    // cv::waitKey(0);

    return img;
}

// post-processing stage ----------------------------------------------------------------------------------------------
void postprocessResults(cv::Mat &frame, float *gpu_output, const nvinfer1::Dims &dims, int batch_size)
{
    float conf_threshold = 0.4f;

    // copy results from GPU to CPU
    std::vector<float> cpu_output(getSizeByDim(dims) * batch_size);
    cudaMemcpy(cpu_output.data(), gpu_output, cpu_output.size() * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < cpu_output.size(); ++i)
    {
        std::cout << cpu_output[i] << " ";
        if (!((i + 1) % 78))
        {
            std::cout << "\n\n";
        }
    }
    std::cout << "------------------------\n";
    std::vector<std::vector<float>> detections(cpu_output.size() / 78, std::vector<float>(2));
    for (size_t i = 0; i < cpu_output.size() / 78; ++i)
    {
        detections[i][0] = cpu_output[i * 78];
        detections[i][1] = cpu_output[(i + 1) * 78];
    }
    const auto xyscores = softmax(detections);
    for (const auto &score : xyscores)
    {
        for (const auto &val : score)
        {
            std::cout << val << " ";
        }
        std::cout << "\n";
    }
    std::cout << "------------------------\n";
    std::vector<float> scores;
    std::vector<std::vector<float>> predictions(cpu_output.size() / 78, std::vector<float>(78));
    for (int i = 0; i < xyscores.size(); ++i)
    {
        scores.emplace_back(xyscores[i][1]);
        if (xyscores[i][1] >= conf_threshold)
        {
            std::copy(cpu_output.begin() + i * 78, cpu_output.begin() + (i + 1) * 78, predictions[i].begin());
        }
    }
    std::cout << "predictions: " << predictions.size() << "\n";

    int n_offsets = 72;
    int n_strips = n_offsets - 1;
    int img_w = 800;
    int img_h = 320;
    int ori_img_w = 1280;
    int ori_img_h = 720;
    int cut_height = 160;
    std::vector<std::vector<float>> nms_predictions;
    for (auto &prediction : predictions)
    {
        prediction[5] = std::round(prediction[5] * n_strips);
        std::vector<float> predict(sizeof(Detection) - sizeof(float));
        for (size_t j = 0; j < 4; ++j)
        {
            predict[j] = prediction[j];
        }
        for (size_t j = 5; j < prediction.size(); ++j)
        {
            predict[j - 1] = prediction[j];
        }
        nms_predictions.emplace_back(predict);
    }
    std::cout << "nms_predictions: " << nms_predictions.size() << "\n";

    for (auto &nms_prediction : nms_predictions)
    {
        nms_prediction[4] *= n_strips;
        for (size_t i = 5; i < nms_prediction.size(); ++i)
        {
            nms_prediction[i] *= ori_img_w - 1;
        }
    }

    auto keep = Lane_nms(nms_predictions, scores, 50, 5);
    std::cout << "keep: " << keep.size() << "\n";

    std::vector<float> prior_ys(n_offsets);
    for (int i = 0; i < n_offsets; ++i)
    {
        prior_ys[i] = 1.0f - static_cast<float>(i) / n_strips;
    }

    auto lanes = predictions_to_pred(predictions, prior_ys, n_strips, ori_img_w, ori_img_h, img_w, img_h, cut_height);
    imshow_lanes(frame, lanes);
}

// initialize TensorRT engine and parse ONNX model --------------------------------------------------------------------
void parseOnnxModel(const std::string &model_path, TRTUniquePtr<nvinfer1::ICudaEngine> &engine,
                    TRTUniquePtr<nvinfer1::IExecutionContext> &context)
{
    TRTUniquePtr<nvinfer1::IBuilder> builder{nvinfer1::createInferBuilder(gLogger)};
    TRTUniquePtr<nvinfer1::INetworkDefinition> network{builder->createNetworkV2(1)};
    TRTUniquePtr<nvonnxparser::IParser> parser{nvonnxparser::createParser(*network, gLogger)};
    TRTUniquePtr<nvinfer1::IBuilderConfig> config{builder->createBuilderConfig()};
    // parse ONNX
    if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO)))
    {
        std::cerr << "ERROR: could not parse the model.\n";
        return;
    }
    // allow TensorRT to use up to 1GB of GPU memory for tactic selection.
    config->setMaxWorkspaceSize(1ULL << 30);
    // use FP16 mode if possible
    if (builder->platformHasFastFp16())
    {
        std::cout << "fp16\n";
        config->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    // we have only one image in batch
    builder->setMaxBatchSize(1);
    // generate TensorRT engine optimized for the target platform
    engine.reset(builder->buildEngineWithConfig(*network, *config));
    context.reset(engine->createExecutionContext());
}

// initialize TensorRT engine from serialized model --------------------------------------------------------------------
void loadTrtEngine(const std::string &engine_path, TRTUniquePtr<nvinfer1::IRuntime> &runtime,
                   TRTUniquePtr<nvinfer1::ICudaEngine> &engine,
                   TRTUniquePtr<nvinfer1::IExecutionContext> &context)
{
    std::ifstream engine_file(engine_path, std::ios::binary);
    if (!engine_file)
    {
        std::cerr << "ERROR: could not open the engine file.\n";
        return;
    }

    // 计算文件大小
    engine_file.seekg(0, engine_file.end);
    size_t file_size = engine_file.tellg();
    engine_file.seekg(0, engine_file.beg);

    // 加载文件内容到内存中
    std::vector<char> trt_model_stream(file_size);
    engine_file.read(trt_model_stream.data(), file_size);
    engine_file.close();

    // 创建runtime
    runtime.reset(nvinfer1::createInferRuntime(gLogger));

    // 反序列化计划文件并创建引擎
    engine.reset(runtime->deserializeCudaEngine(trt_model_stream.data(), file_size, nullptr));

    // 创建上下文
    context.reset(engine->createExecutionContext());
}

// main pipeline ------------------------------------------------------------------------------------------------------
int main(int argc, char *argv[])
{
    if (argc < 3)
    {
        std::cerr << "usage: " << argv[0] << " model.onnx image.jpg\n";
        return -1;
    }
    std::string model_path(argv[1]);
    std::string image_path(argv[2]);
    int batch_size = 1;

    // initialize TensorRT engine and parse ONNX model
    TRTUniquePtr<nvinfer1::IRuntime> runtime{nullptr};
    TRTUniquePtr<nvinfer1::ICudaEngine> engine{nullptr};
    TRTUniquePtr<nvinfer1::IExecutionContext> context{nullptr};
    // parseOnnxModel(model_path, engine, context);
    loadTrtEngine(model_path, runtime, engine, context);

    // get sizes of input and output and allocate memory required for input data and for output data
    std::vector<nvinfer1::Dims> input_dims;               // we expect only one input
    std::vector<nvinfer1::Dims> output_dims;              // and one output
    std::vector<void *> buffers(engine->getNbBindings()); // buffers for input and output data
    for (size_t i = 0; i < engine->getNbBindings(); ++i)
    {
        auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
        cudaMalloc(&buffers[i], binding_size);
        if (engine->bindingIsInput(i))
        {
            input_dims.emplace_back(engine->getBindingDimensions(i));
        }
        else
        {
            output_dims.emplace_back(engine->getBindingDimensions(i));
        }
    }
    if (input_dims.empty() || output_dims.empty())
    {
        std::cerr << "Expect at least one input and one output for network\n";
        return -1;
    }

    // read input image
    cv::Mat frame = cv::imread(image_path);
    if (frame.empty())
    {
        std::cerr << "Input image " << image_path << " load failed\n";
        return -1;
    }
    cv::cuda::GpuMat gpu_frame;
    // upload image to GPU
    gpu_frame.upload(frame);

    // preprocess input data
    preprocessImage(gpu_frame, (float *)buffers[0], input_dims[0]);
    // inference
    context->enqueue(batch_size, buffers.data(), 0, nullptr);
    // postprocess results
    postprocessResults(frame, (float *)buffers[1], output_dims[0], batch_size);

    for (void *buf : buffers)
    {
        cudaFree(buf);
    }
    return 0;
}