FastestDet 在 Windows 上使用 GPU 推理时报错

error log | 日志或报错信息 | ログ

[0 Intel(R) HD Graphics 620] queueC=0[1] queueG=0[1] queueT=0[1] [0 Intel(R) HD Graphics 620] bugsbn1=0 bugbilz=173 bugcopc=0 bugihfa=0 [0 Intel(R) HD Graphics 620] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 [0 Intel(R) HD Graphics 620] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 [1 GeForce MX150] queueC=2[8] queueG=0[16] queueT=1[2] [1 GeForce MX150] bugsbn1=0 bugbilz=173 bugcopc=0 bugihfa=0 [1 GeForce MX150] fp16-p/s/a=1/1/0 int8-p/s/a=1/1/1 [1 GeForce MX150] subgroup=32 basic=1 vote=1 ballot=1 shuffle=1 compile spir-v module failed ERROR: 0:76: 'limitations' : Non-constant-index-expression ERROR: 0:76: 'limitations' : Non-constant-index-expression ERROR: 0:85: 'limitations' : Non-constant-index-expression ERROR: 0:96: 'limitations' : Non-constant-index-expression ERROR: 0:96: 'limitations' : Non-constant-index-expression ERROR: 0:110: 'limitations' : Non-constant-index-expression ERROR: 0:110: 'limitations' : Non-constant-index-expression ERROR: 0:129: 'limitations' : Non-constant-index-expression ERROR: 0:129: 'limitations' : Non-constant-index-expression ERROR: 0:138: 'limitations' : Non-constant-index-expression ERROR: 0:151: 'limitations' : Non-constant-index-expression ERROR: 0:151: 'limitations' : Non-constant-index-expression ERROR: 0:169: 'limitations' : Non-constant-index-expression ERROR: 0:169: 'limitations' : Non-constant-index-expression ERROR: 0:189: 'limitations' : Non-constant-index-expression ERROR: 0:189: 'limitations' : Non-constant-index-expression ERROR: 0:197: 'limitations' : Non-constant-index-expression ERROR: 0:197: 'limitations' : Non-constant-index-expression ERROR: 0:206: 'limitations' : Non-constant-index-expression ERROR: 0:220: 'limitations' : Non-constant-index-expression ERROR: 0:220: 'limitations' : Non-constant-index-expression ERROR: 0:241: 'limitations' : Non-constant-index-expression ERROR: 0:241: 'limitations' : Non-constant-index-expression ERROR: 23 compilation errors. No code generated.

compile_spirv_module failed -1 create_shader_module failed compile spir-v module failed ERROR: 0:87: 'limitations' : Non-constant-index-expression ERROR: 0:87: 'limitations' : Non-constant-index-expression ERROR: 0:95: 'limitations' : Non-constant-index-expression ERROR: 0:95: 'limitations' : Non-constant-index-expression ERROR: 0:104: 'limitations' : Non-constant-index-expression ERROR: 5 compilation errors. No code generated. 还有很多重复性质的内容，不再重复列出

context | 编译/运行环境 | バックグラウンド

Windows 11 msvc2022 32位编译 ncnn 版本 20221128 msvc2022 opencv 版本 mobile-opencv 4.6.0 msvc2022 vulkan 版本 1.3.224.1

显卡 MX150&Intel UHD620

how to reproduce | 复现步骤 | 再現方法

1.使用对应的ncnn库 2.编译，运行 3.以GPU方式载入对应的模型库 4.输出以上信息

more | 其他 | その他

代码（临时写的胶水代码，大佬勿喷）。其本质应与 FastestDet 官方所提供 demo 中的代码无区别。虽然这个问题不大，但是很好奇为什么。

#include "pch.h"

// One detection handed back to the caller through fastestdet_Deal's ResList.
struct Object
{
    // Bounding box; fastestdet_Deal fills x/y from the box's (x1, y1) corner
    // and width/height from (x2 - x1)/(y2 - y1).
    cv::Rect_<float> rect;
    // Class index of the detection.
    int label;
    // Score of the detection (the value compared against prob_threshold).
    float prob;
};

// State behind one detector handle created by fastestdet_Init.
struct _fastestdet
{
    // The ncnn network used for inference.
    ncnn::Net net;

    // Private copies of the param/model buffers passed to fastestdet_Init.
    // NOTE(review): presumably kept alive here because ncnn may reference the
    // model memory instead of copying it — confirm against the ncnn docs.
    std::vector<unsigned char> param;
    std::vector<unsigned char> model;
};

// Handle type exposed through the exported C API.
typedef _fastestdet* __fastestdet;

// Logistic function: 1 / (1 + e^(-x)).
float Sigmoid(float x)
{
    const auto decay = exp(-x);
    const auto denominator = 1.0f + decay;
    return 1.0f / denominator;
}

// Hyperbolic tangent written as 2 / (1 + e^(-2x)) - 1.
float Tanh(float x)
{
    const auto decay = exp(-2 * x);
    const auto scaled = 2.0f / (1.0f + decay);
    return scaled - 1;
}

// Axis-aligned detection box with integer corner coordinates.
//
// The class has only public data members and no constructors, so it can be
// brace-initialized as TargetBox{ x1, y1, x2, y2, category, score }.
class TargetBox
{
private:
    // Horizontal extent (x2 - x1) as a float.
    float GetWidth() const { return static_cast<float>(x2 - x1); }
    // Vertical extent (y2 - y1) as a float.
    float GetHeight() const { return static_cast<float>(y2 - y1); }

public:
    // First corner.
    int x1;
    int y1;
    // Second corner.
    int x2;
    int y2;

    // Class index of the detection.
    int category;
    // Score of the detection.
    float score;

    // Box area (width * height). Const so it can be called through the
    // const references used by the NMS helpers.
    float area() const { return GetWidth() * GetHeight(); }
};

// Area of the overlap between two boxes; 0 when they are disjoint.
float IntersectionArea(const TargetBox& a, const TargetBox& b)
{
    const bool overlaps_x = a.x1 <= b.x2 && a.x2 >= b.x1;
    const bool overlaps_y = a.y1 <= b.y2 && a.y2 >= b.y1;
    if (!(overlaps_x && overlaps_y))
    {
        // The boxes do not touch on at least one axis.
        return 0.f;
    }

    // Overlap rectangle: the innermost edges of the two boxes.
    const int left = std::max(a.x1, b.x1);
    const int right = std::min(a.x2, b.x2);
    const int top = std::max(a.y1, b.y1);
    const int bottom = std::min(a.y2, b.y2);

    const float overlap_w = right - left;
    const float overlap_h = bottom - top;
    return overlap_w * overlap_h;
}

bool scoreSort(TargetBox a, TargetBox b)
{
    return (a.score > b.score);
}

//NMS处理
int nmsHandle(std::vector<TargetBox>& src_boxes, std::vector<TargetBox>& dst_boxes, const float nms_threshold)
{
    std::vector<int> picked;

    sort(src_boxes.begin(), src_boxes.end(), scoreSort);

    for (int i = 0; i < src_boxes.size(); i++)
    {
        int keep = 1;
        for (int j = 0; j < picked.size(); j++)
        {
            //交集
            float inter_area = IntersectionArea(src_boxes[i], src_boxes[picked[j]]);
            //并集
            float union_area = src_boxes[i].area() + src_boxes[picked[j]].area() - inter_area;
            float IoU = inter_area / union_area;

            if (IoU > nms_threshold && src_boxes[i].category == src_boxes[picked[j]].category)
            {
                keep = 0;
                break;
            }
        }

        if (keep) {
            picked.push_back(i);
        }
    }

    for (int i = 0; i < picked.size(); i++)
    {
        dst_boxes.push_back(src_boxes[picked[i]]);
    }

    return 0;
}

// Creates a detector from in-memory param/model buffers.
//
// mem_param / size_param : text .param contents and its length in bytes.
// mem_model / size_model : model weights and their length in bytes.
// use_vulkan             : enable ncnn's Vulkan compute path.
//
// Both buffers are copied into the returned handle. Returns NULL on failure
// (invalid buffers, no GPU when use_vulkan is set, or a load error). A
// non-NULL handle must be released with fastestdet_Destroy.
extern "C" __declspec(dllexport) __fastestdet __stdcall fastestdet_Init(const unsigned char* mem_param, const int size_param, const unsigned char* mem_model, const int size_model, const bool use_vulkan)
{
    // Reject unusable buffers up front: a null pointer or non-positive size
    // would otherwise be undefined behaviour in the copies below.
    if (mem_param == NULL || mem_model == NULL || size_param <= 0 || size_model <= 0)
    {
        std::cout << "[fastestdet]Err Invalid Param Or Model Buffer" << std::endl;
        return NULL;
    }

    if (use_vulkan && ncnn::get_gpu_count() == 0)
    {
        // no gpu
        std::cout << "[fastestdet]Err Your GPU count is Zero" << std::endl;
        return NULL;
    }

    _fastestdet* fastestdetNet = new _fastestdet;

    fastestdetNet->net.opt.use_vulkan_compute = use_vulkan;
    fastestdetNet->net.opt.num_threads = ncnn::get_big_cpu_count();

    fastestdetNet->param.assign(mem_param, mem_param + size_param);
    fastestdetNet->model.assign(mem_model, mem_model + size_model);

    // load_param_mem is given only a pointer, so terminate the text with a NUL.
    fastestdetNet->param.push_back(0);

    if (fastestdetNet->net.load_param_mem(reinterpret_cast<const char*>(fastestdetNet->param.data())) != 0)
    {
        std::cout << "[fastestdet]Err Read Param Failed" << std::endl;
        delete fastestdetNet;
        return NULL;
    }
    // NOTE(review): ncnn's in-memory load_model overload is documented as
    // returning the number of bytes consumed, so 0 is treated as failure here
    // (unlike load_param_mem above) — confirm against the ncnn version in use.
    if (fastestdetNet->net.load_model(fastestdetNet->model.data()) == 0)
    {
        std::cout << "[fastestdet]Err Read Model Failed" << std::endl;
        delete fastestdetNet;
        return NULL;
    }
    return fastestdetNet;
}

// Runs detection on one encoded image.
//
// fastestdet     : handle from fastestdet_Init.
// img_src/size   : encoded image bytes (decoded with cv::imdecode).
// target_size    : network input width and height.
// prob_threshold : minimum score for a cell to produce a box.
// nms_threshold  : IoU threshold passed to nmsHandle.
// class_num      : number of class channels in the network output.
// ResList        : receives a new[]-allocated array of results, or NULL when
//                  nothing is returned; release it with fastestdet_DestructRet.
//
// Returns the number of entries written to *ResList (0 on any error or when
// nothing is detected).
extern "C" int __declspec(dllexport) __stdcall fastestdet_Deal(__fastestdet fastestdet, const unsigned char* img_src, const int img_size, const int target_size, const float prob_threshold, const float nms_threshold,const int class_num, Object * *ResList)
{
    // Give the caller a well-defined pointer on every path that returns 0.
    if (ResList != NULL)
    {
        *ResList = NULL;
    }

    if (fastestdet == NULL)
    {
        std::cout << "[fastestdet]Not Init" << std::endl;
        return 0;
    }

    if (img_src == NULL || img_size <= 0 || target_size <= 0 || class_num <= 0 || ResList == NULL)
    {
        std::cout << "[fastestdet]ERR Invalid Argument" << std::endl;
        return 0;
    }

    // Force an 8-bit, 3-channel BGR image: the pixels are handed to ncnn as
    // PIXEL_BGR below, so a grayscale, BGRA or 16-bit decode would be
    // misinterpreted. IMREAD_IGNORE_ORIENTATION keeps the stored pixel layout
    // (no EXIF rotation).
    cv::_InputArray pic_arr(img_src, img_size);
    cv::Mat src_mat = cv::imdecode(pic_arr, cv::IMREAD_COLOR | cv::IMREAD_IGNORE_ORIENTATION);

    if (src_mat.empty())
    {
        std::cout << "[fastestdet]ERR Cant Read Img" << std::endl;
        return 0;
    }

    const int img_width = src_mat.cols;
    const int img_height = src_mat.rows;

    // resize of input image data
    ncnn::Mat input = ncnn::Mat::from_pixels_resize(src_mat.data, ncnn::Mat::PIXEL_BGR,
        img_width, img_height, target_size, target_size);
    // Normalization of input image data
    const float mean_vals[3] = { 0.f, 0.f, 0.f };
    const float norm_vals[3] = { 1 / 255.f, 1 / 255.f, 1 / 255.f };
    input.substract_mean_normalize(mean_vals, norm_vals);

    // create extractor
    ncnn::Extractor ex = fastestdet->net.create_extractor();
    ex.set_num_threads(1);

    // set input tensor
    if (ex.input("input.1", input) != 0)
    {
        std::cout << "[fastestdet]ERR Set Input Failed" << std::endl;
        return 0;
    }

    // get output tensor
    ncnn::Mat output;
    if (ex.extract("758", output) != 0 || output.empty())
    {
        std::cout << "[fastestdet]ERR Inference Failed" << std::endl;
        return 0;
    }

    // Channel layout read below: 0 = objectness, 1..4 = box, 5.. = classes.
    if (output.c < 5 + class_num)
    {
        std::cout << "[fastestdet]ERR Output Channels Less Than 5 + class_num" << std::endl;
        return 0;
    }

    const int grid_w = output.w;
    const int grid_h = output.h;

    // Read one value through channel()/row() rather than a flat
    // c * h * w index: ncnn may pad each channel (cstep), so flat indexing is
    // only correct when w * h happens to match the padded stride.
    auto at = [&output](int c, int y, int x) -> float {
        return output.channel(c).row(y)[x];
    };

    // handle output tensor
    std::vector<TargetBox> target_boxes;

    for (int h = 0; h < grid_h; h++)
    {
        for (int w = 0; w < grid_w; w++)
        {
            // Foreground (objectness) probability
            const float obj_score = at(0, h, w);

            // Pick the class with the highest score
            int category = 0;
            float max_score = 0.0f;
            for (int i = 0; i < class_num; i++)
            {
                const float cls_score = at(5 + i, h, w);
                if (cls_score > max_score)
                {
                    max_score = cls_score;
                    category = i;
                }
            }
            const float score = static_cast<float>(pow(max_score, 0.4) * pow(obj_score, 0.6));

            // Threshold filter
            if (score > prob_threshold)
            {
                // Decode the box: offsets within the cell and normalized size
                const float x_offset = Tanh(at(1, h, w));
                const float y_offset = Tanh(at(2, h, w));
                const float box_width = Sigmoid(at(3, h, w));
                const float box_height = Sigmoid(at(4, h, w));

                const float cx = (w + x_offset) / grid_w;
                const float cy = (h + y_offset) / grid_h;

                // Scale the normalized box to source-image pixels
                const int x1 = static_cast<int>((cx - box_width * 0.5) * img_width);
                const int y1 = static_cast<int>((cy - box_height * 0.5) * img_height);
                const int x2 = static_cast<int>((cx + box_width * 0.5) * img_width);
                const int y2 = static_cast<int>((cy + box_height * 0.5) * img_height);

                target_boxes.push_back(TargetBox{ x1, y1, x2, y2, category, score });
            }
        }
    }

    // NMS
    std::vector<TargetBox> nms_boxes;
    nmsHandle(target_boxes, nms_boxes, nms_threshold);

    if (nms_boxes.empty())
        return 0;

    // Ownership passes to the caller; freed by fastestdet_DestructRet.
    *ResList = new Object[nms_boxes.size()];
    for (std::size_t i = 0; i < nms_boxes.size(); i++)
    {
        Object& out = (*ResList)[i];
        const TargetBox& box = nms_boxes[i];
        out.label = box.category;
        out.prob = box.score;
        out.rect.x = box.x1;
        out.rect.y = box.y1;
        out.rect.width = box.x2 - box.x1;
        out.rect.height = box.y2 - box.y1;
    }
    return static_cast<int>(nms_boxes.size());
}

// Frees a result array previously returned through fastestdet_Deal's ResList.
extern "C" void __declspec(dllexport) __stdcall fastestdet_DestructRet(Object * ResList)
{
    if (!ResList)
    {
        // Nothing to free (delete[] on a null pointer is a no-op anyway).
        return;
    }
    delete[] ResList;
}

// Releases a detector created by fastestdet_Init.
// Passing NULL is a no-op, since fastestdet_Init returns NULL on failure.
extern "C" void __declspec(dllexport) __stdcall fastestdet_Destroy(__fastestdet fastestdet)
{
    if (fastestdet == NULL)
    {
        return;
    }

    fastestdet->net.clear();
    delete fastestdet;
}

Tencent / ncnn