wang-xinyu / tensorrtx

Implementation of popular deep learning networks with TensorRT network definition API
MIT License

Adding a second output to yolov5 #268

ylmzkaan closed this issue 3 years ago

ylmzkaan commented 3 years ago

I am trying to add a second head to yolov5s, and I get a `cuda failure: 1` error inside the `context.enqueue()` call. Here are the steps I took to add the second output to the network; maybe someone can tell me what is wrong with them.

The new head takes the output of bottleneck_csp9 as input, applies a couple of convolutional layers, and ends in a fully connected layer with 7 neurons. Inside the `createEngine_s()` function, I created this new head and added its 7-neuron output to the outputs of the network as follows (`det3` is the 7-neuron fully connected layer):

```cpp
det3->getOutput(0)->setName(WHOLESCENE_OUTPUT_BLOB_NAME);
network->markOutput(*det3->getOutput(0));
```


As I said, `context.enqueue(batchSize, buffers, stream, nullptr);` gives the `cuda failure: 1` error. Can someone tell me what is wrong with this?
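
For reference, CUDA error code 1 is `cudaErrorInvalidValue`, i.e. some CUDA runtime call received an invalid pointer or size. A minimal standalone sketch (separate from the program below) that decodes the number:

```cpp
// Standalone sketch: decode CUDA error code 1. Requires only the CUDA runtime.
#include <cuda_runtime_api.h>
#include <iostream>

int main() {
    cudaError_t err = static_cast<cudaError_t>(1);      // the code behind "cuda failure: 1"
    std::cout << cudaGetErrorName(err) << ": "          // prints "cudaErrorInvalidValue"
              << cudaGetErrorString(err) << std::endl;  // prints "invalid argument"
    return 0;
}
```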

Here is the complete yolov5.cpp that I use:

```cpp
#include <iostream>
#include <chrono>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"

#define USE_FP16  // comment out this if you want to use FP32
#define DEVICE 0  // GPU id
#define NMS_THRESH 0.65
#define CONF_THRESH 0.4
#define BATCH_SIZE 1
#define N_CLASS_WHOLESCENE 7

#define NET s  // s m l x
#define NETSTRUCT(str) createEngine_##str
#define CREATENET(net) NETSTRUCT(net)
#define STR1(x) #x
#define STR2(x) STR1(x)

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
// we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;
static const int OUTPUT_SIZE_WHOLESCENE = N_CLASS_WHOLESCENE + 1;
const char* INPUT_BLOB_NAME = "data";
const char* OUTPUT_BLOB_NAME = "prob";
const char* WHOLESCENE_OUTPUT_BLOB_NAME = "wholescene_output";
static Logger gLogger;

// Create the engine using only the API and not any parser.
ICudaEngine* createEngine_s(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config, DataType dt) {
INetworkDefinition* network = builder->createNetworkV2(0U);

// Create input tensor of shape {3, INPUT_H, INPUT_W} with name INPUT_BLOB_NAME
ITensor* data = network->addInput(INPUT_BLOB_NAME, dt, Dims3{ 3, INPUT_H, INPUT_W });
assert(data);

std::map<std::string, Weights> weightMap = loadWeights("../yolov5s.wts");
std::map<std::string, Weights> weightMap_ws = loadWeights("../MultiHead.wts");
Weights emptywts{ DataType::kFLOAT, nullptr, 0 };

// yolov5 backbone
auto focus0 = focus(network, weightMap, *data, 3, 32, 3, "model.0");
auto conv1 = convBlock(network, weightMap, *focus0->getOutput(0), 64, 3, 2, 1, "model.1");
auto bottleneck_CSP2 = bottleneckCSP(network, weightMap, *conv1->getOutput(0), 64, 64, 1, true, 1, 0.5, "model.2");
auto conv3 = convBlock(network, weightMap, *bottleneck_CSP2->getOutput(0), 128, 3, 2, 1, "model.3");
auto bottleneck_csp4 = bottleneckCSP(network, weightMap, *conv3->getOutput(0), 128, 128, 3, true, 1, 0.5, "model.4");
auto conv5 = convBlock(network, weightMap, *bottleneck_csp4->getOutput(0), 256, 3, 2, 1, "model.5");
auto bottleneck_csp6 = bottleneckCSP(network, weightMap, *conv5->getOutput(0), 256, 256, 3, true, 1, 0.5, "model.6");
auto conv7 = convBlock(network, weightMap, *bottleneck_csp6->getOutput(0), 512, 3, 2, 1, "model.7");
auto spp8 = SPP(network, weightMap, *conv7->getOutput(0), 512, 512, 5, 9, 13, "model.8");

// yolov5 head
auto bottleneck_csp9 = bottleneckCSP(network, weightMap, *spp8->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.9");
auto conv10 = convBlock(network, weightMap, *bottleneck_csp9->getOutput(0), 256, 1, 1, 1, "model.10");

float *deval = reinterpret_cast<float*>(malloc(sizeof(float) * 256 * 2 * 2));
for (int i = 0; i < 256 * 2 * 2; i++) {
    deval[i] = 1.0;
}
Weights deconvwts11{ DataType::kFLOAT, deval, 256 * 2 * 2 };
IDeconvolutionLayer* deconv11 = network->addDeconvolutionNd(*conv10->getOutput(0), 256, DimsHW{ 2, 2 }, deconvwts11, emptywts);
deconv11->setStrideNd(DimsHW{ 2, 2 });
deconv11->setNbGroups(256);
weightMap["deconv11"] = deconvwts11;

ITensor* inputTensors12[] = { deconv11->getOutput(0), bottleneck_csp6->getOutput(0) };
auto cat12 = network->addConcatenation(inputTensors12, 2);
auto bottleneck_csp13 = bottleneckCSP(network, weightMap, *cat12->getOutput(0), 512, 256, 1, false, 1, 0.5, "model.13");
auto conv14 = convBlock(network, weightMap, *bottleneck_csp13->getOutput(0), 128, 1, 1, 1, "model.14");

Weights deconvwts15{ DataType::kFLOAT, deval, 128 * 2 * 2 };
IDeconvolutionLayer* deconv15 = network->addDeconvolutionNd(*conv14->getOutput(0), 128, DimsHW{ 2, 2 }, deconvwts15, emptywts);
deconv15->setStrideNd(DimsHW{ 2, 2 });
deconv15->setNbGroups(128);

ITensor* inputTensors16[] = { deconv15->getOutput(0), bottleneck_csp4->getOutput(0) };
auto cat16 = network->addConcatenation(inputTensors16, 2);
auto bottleneck_csp17 = bottleneckCSP(network, weightMap, *cat16->getOutput(0), 256, 128, 1, false, 1, 0.5, "model.17");
IConvolutionLayer* det0 = network->addConvolutionNd(*bottleneck_csp17->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.0.weight"], weightMap["model.24.m.0.bias"]);

auto conv18 = convBlock(network, weightMap, *bottleneck_csp17->getOutput(0), 128, 3, 2, 1, "model.18");
ITensor* inputTensors19[] = { conv18->getOutput(0), conv14->getOutput(0) };
auto cat19 = network->addConcatenation(inputTensors19, 2);
auto bottleneck_csp20 = bottleneckCSP(network, weightMap, *cat19->getOutput(0), 256, 256, 1, false, 1, 0.5, "model.20");
IConvolutionLayer* det1 = network->addConvolutionNd(*bottleneck_csp20->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.1.weight"], weightMap["model.24.m.1.bias"]);

auto conv21 = convBlock(network, weightMap, *bottleneck_csp20->getOutput(0), 256, 3, 2, 1, "model.21");
ITensor* inputTensors22[] = { conv21->getOutput(0), conv10->getOutput(0) };
auto cat22 = network->addConcatenation(inputTensors22, 2);
auto bottleneck_csp23 = bottleneckCSP(network, weightMap, *cat22->getOutput(0), 512, 512, 1, false, 1, 0.5, "model.23");
IConvolutionLayer* det2 = network->addConvolutionNd(*bottleneck_csp23->getOutput(0), 3 * (Yolo::CLASS_NUM + 5), DimsHW{ 1, 1 }, weightMap["model.24.m.2.weight"], weightMap["model.24.m.2.bias"]);

// yolov5 whole scene classifier
auto conv_ws_1 = convBlockRelu(network, weightMap_ws, *bottleneck_csp9->getOutput(0), 128, 5, 1, 0, "conv1");
auto conv_ws_1_pool = pooling(network, *conv_ws_1->getOutput(0), PoolingType::kMAX, 2, 2, 0);
auto conv_ws_2 = convBlockRelu(network, weightMap_ws, *conv_ws_1_pool->getOutput(0), 32, 3, 1, 0, "conv2");
auto conv_ws_2_pool = pooling(network, *conv_ws_2->getOutput(0), PoolingType::kAVERAGE, 2, 3, 0);
auto fc_ws_1 = fullyConnectedRelu(network, weightMap_ws, *conv_ws_2_pool->getOutput(0), 256, "fc1");
auto fc_ws_2 = fullyConnectedRelu(network, weightMap_ws, *fc_ws_1->getOutput(0), 128, "fc2");
IFullyConnectedLayer* det3 = fullyConnected(network, weightMap_ws, *fc_ws_2->getOutput(0), N_CLASS_WHOLESCENE, "fc3");

auto yolo = addYoLoLayer(network, weightMap, det0, det1, det2);
yolo->getOutput(0)->setName(OUTPUT_BLOB_NAME);
network->markOutput(*yolo->getOutput(0));

det3->getOutput(0)->setName(WHOLESCENE_OUTPUT_BLOB_NAME);
network->markOutput(*det3->getOutput(0));

// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(512 * (1 << 20));  // 512MB

#ifdef USE_FP16
config->setFlag(BuilderFlag::kFP16);
#endif

std::cout << "Building engine, please wait for a while..." << std::endl;
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "Build engine successfully!" << std::endl;

// Don't need the network any more
network->destroy();

// Release host memory
for (auto& mem : weightMap)
{
    free((void*)(mem.second.values));

}
return engine;

}

void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream) {
// Create builder
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();

// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = (CREATENET(NET))(maxBatchSize, builder, config, DataType::kFLOAT);
//ICudaEngine* engine = createEngine(maxBatchSize, builder, config, DataType::kFLOAT);
assert(engine != nullptr);

// Serialize the engine
(*modelStream) = engine->serialize();

// Close everything down
engine->destroy();
builder->destroy();

}

void doInference(IExecutionContext& context, cudaStream_t& stream, void** buffers, float* input, float* output, float* wholescene, int batchSize) {
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(buffers[0], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context.enqueue(batchSize, buffers, stream, nullptr);
CHECK(cudaMemcpyAsync(output, buffers[1], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
CHECK(cudaMemcpyAsync(wholescene, buffers[2], batchSize * OUTPUT_SIZE_WHOLESCENE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
}

int main(int argc, char** argv) {
cudaSetDevice(DEVICE);
// create a model using the API directly and serialize it to a stream
char* trtModelStream{ nullptr };
size_t size{ 0 };
std::string engine_name = STR2(NET);
engine_name = "yolov5" + engine_name + ".engine";
if (argc == 2 && std::string(argv[1]) == "-s") {
    IHostMemory* modelStream{ nullptr };
    APIToModel(BATCH_SIZE, &modelStream);
    assert(modelStream != nullptr);
    std::ofstream p(engine_name, std::ios::binary);
    if (!p) {
        std::cerr << "could not open plan output file" << std::endl;
        return -1;
    }
    p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
    modelStream->destroy();
    return 0;
} else if (argc == 3 && std::string(argv[1]) == "-d") {
    std::ifstream file(engine_name, std::ios::binary);
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }
} else {
    std::cerr << "arguments not right!" << std::endl;
    std::cerr << "./yolov5 -s  // serialize model to plan file" << std::endl;
    std::cerr << "./yolov5 -d ../samples  // deserialize plan file and run inference" << std::endl;
    return -1;
}

std::string file_name = argv[2];

// prepare input data ---------------------------
static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
//for (int i = 0; i < 3 * INPUT_H * INPUT_W; i++)
//    data[i] = 1.0;
static float prob[BATCH_SIZE * OUTPUT_SIZE];
static float wholescene[BATCH_SIZE * OUTPUT_SIZE_WHOLESCENE];
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
assert(engine->getNbBindings() == 2);
void* buffers[3];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = engine->getBindingIndex(INPUT_BLOB_NAME);
const int outputIndex = engine->getBindingIndex(OUTPUT_BLOB_NAME);
const int outputIndex2 = engine->getBindingIndex(WHOLESCENE_OUTPUT_BLOB_NAME);
assert(inputIndex == 0);
assert(outputIndex == 1);
assert(outputIndex2 == 2);
// Create GPU buffers on device
CHECK(cudaMalloc(&buffers[inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
CHECK(cudaMalloc(&buffers[outputIndex2], BATCH_SIZE * OUTPUT_SIZE_WHOLESCENE * sizeof(float)));
// Create stream
cudaStream_t stream;
CHECK(cudaStreamCreate(&stream));

cv::Mat img = cv::imread(file_name);
cv::Mat pr_img = preprocess_img(img); // letterbox BGR to RGB
int i = 0;
for (int row = 0; row < INPUT_H; ++row) {
    uchar* uc_pixel = pr_img.data + row * pr_img.step;
    for (int col = 0; col < INPUT_W; ++col) {
        data[i] = (float)uc_pixel[2] / 255.0;
        data[i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
        data[i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
        uc_pixel += 3;
        ++i;
    }
}

// Run inference
for (int j=0; j<2; ++j)
{
    auto start = std::chrono::system_clock::now();
    doInference(*context, stream, buffers, data, prob, wholescene, BATCH_SIZE);
    auto end = std::chrono::system_clock::now();
    std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
}
std::vector<Yolo::Detection> res;
nms(res, prob, CONF_THRESH, NMS_THRESH);

for (int i=0; i<N_CLASS_WHOLESCENE; ++i)
    std::cout << wholescene[i] << " ";
std::cout << std::endl;

//std::cout << res.size() << std::endl;
for (size_t j = 0; j < res.size(); j++) {
    cv::Rect r = get_rect(img, res[j].bbox);
    cv::rectangle(img, r, cv::Scalar(0x27, 0xC1, 0x36), 2);
    cv::putText(img, std::to_string((int)res[j].class_id), cv::Point(r.x, r.y - 1), cv::FONT_HERSHEY_PLAIN, 1.2, cv::Scalar(0xFF, 0xFF, 0xFF), 2);
}
const std::string fname = "/home/kyilmaz2/src/tensorrtx/yolov5-multihead/yolov5/build/output.jpg";
cv::imwrite(fname, img);

// Release stream and buffers
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[inputIndex]));
CHECK(cudaFree(buffers[outputIndex]));
// Destroy the engine
context->destroy();
engine->destroy();
runtime->destroy();

// Print histogram of the output distribution
//std::cout << "\nOutput:\n\n";
//for (unsigned int i = 0; i < OUTPUT_SIZE; i++)
//{
//    std::cout << prob[i] << ", ";
//    if (i % 10 == 0) std::cout << std::endl;
//}
//std::cout << std::endl;

return 0;

}
```

wang-xinyu commented 3 years ago

Maybe outputIndex2 is 1, not 2. You need to check it.
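
A minimal sketch of such a check (illustrative, not from the original reply), using the TensorRT binding API already used in the code above:

```cpp
// Illustrative only: enumerate engine bindings after deserialization to verify
// the indices instead of relying on hard-coded asserts. Typically the input binding
// comes first, then the outputs in the order they were marked with markOutput().
for (int i = 0; i < engine->getNbBindings(); ++i) {
    std::cout << "binding " << i << ": " << engine->getBindingName(i)
              << (engine->bindingIsInput(i) ? " (input)" : " (output)") << std::endl;
}
```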

ylmzkaan commented 3 years ago

I have an assert that checks that.

stale[bot] commented 3 years ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.