triton-inference-server / server

The Triton Inference Server provides an optimized cloud and edge inferencing solution.
https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/index.html
BSD 3-Clause "New" or "Revised" License

Custom backend using recommended.cc not generating correct output #7266

Closed. jgrsdave closed this issue 2 months ago.

jgrsdave commented 4 months ago

Description: We are trying to host a preprocessing model using the Python backend / a custom backend. The Python backend gives very high latency even though we are only doing very basic lookups, so we decided to build a custom backend, taking recommended.cc as the reference: https://github.com/triton-inference-server/backend/blob/main/examples/backends/recommended/src/recommended.cc. Below is the modified recommended.cc code. The output is computed properly inside the backend, but I am not getting that same output in the response.

Triton Information: 24.01

Are you using the Triton container or did you build it yourself? I have built a custom Triton image.

Describe the models (framework, inputs, outputs), ideally include the model configuration file (if using an ensemble include the model configuration file for that as well).

config.pbtxt file

backend: "recommended" max_batch_size: 8 dynamic_batching { max_queue_delay_microseconds: 5000000 } input [ { name: "INPUT" data_type: TYPE_STRING dims: [ 129 ] } ] output [ { name: "OUTPUT" data_type: TYPE_FP16 dims: [ 129 ] } ] instance_group [ { kind: KIND_CPU } ]

The feature list and categorical index mapping referenced from the Execute method:

std::vector<std::string> feature_list = { "relevance_score" };

std::unordered_map<std::string, std::unordered_map<std::string, int>>
    categorical_index_mapping = {
      {"usergross_orders_bin30_days",
       { {"other", 0}, {"4-10 order", 1}, {"2-4 order", 2},
         {"1-2 order", 3}, {"10-20 order", 4}, {"20+ order", 5} }}
    };
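As a small illustration of how a single categorical value is encoded with this mapping (hypothetical snippet, not from the backend):

// "2-4 order" for usergross_orders_bin30_days encodes to 2; a value missing
// from the map would be inserted with index 0 by operator[], the same index
// as "other".
int encoded = categorical_index_mapping["usergross_orders_bin30_days"]["2-4 order"];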

uint16_t doubleToFP16(double value) {
  uint64_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  uint16_t sign = (bits >> 63) & 0x1;
  int16_t exponent = ((bits >> 52) & 0x7FF) - 1023 + 15;
  uint16_t fraction = (bits >> 42) & 0x3FF;

  if (exponent <= 0) {
    exponent = 0;
    fraction = 0;
  } else if (exponent >= 31) {
    exponent = 31;
    fraction = 0;
  }

  return (sign << 15) | (exponent << 10) | fraction;
}

float FP16ToFloat(uint16_t fp16) {
  uint16_t sign = (fp16 >> 15) & 0x1;
  int16_t exponent = ((fp16 >> 10) & 0x1F) - 15 + 127;
  uint16_t fraction = fp16 & 0x3FF;

  if (exponent <= 0) {
    exponent = 0;
    fraction = 0;
  } else if (exponent >= 255) {
    exponent = 255;
    fraction = 0;
  }

  uint32_t bits = (sign << 31) | (exponent << 23) | (fraction << 13);
  float result;
  std::memcpy(&result, &bits, sizeof(result));

  return result;
}
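A quick round-trip check of the two helpers above (assuming they are in scope); note that doubleToFP16 truncates the mantissa rather than rounding it:

#include <cassert>
#include <cstdint>

void CheckFp16RoundTrip() {
  uint16_t h = doubleToFP16(1.0);  // sign 0, exponent 15, fraction 0 -> 0x3C00
  assert(h == 0x3C00);
  float f = FP16ToFloat(h);        // expands back to 1.0f in single precision
  assert(f == 1.0f);
}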

void EncodeCategoricalData(
    std::vector<float>& fp16_data, std::string* cat_data, size_t num_rows,
    size_t num_cols)
{
  for (size_t ind = 0; ind < num_cols; ++ind) {
    std::string column_name = feature_list[ind + 100];
    // std::cout<<"column_name"<<column_name<<std::endl;
    auto& cat_col_feat = categorical_index_mapping[column_name];
    for (size_t row = 0; row < num_rows; ++row) {
      std::string value = cat_data[row * num_cols + ind + 100];
      // std::cout<<"value"<<value<<std::endl;
      int encoded_value = cat_col_feat[value];
      // std::cout<<"encoded_value"<<value<<"\t"<<encoded_value<<std::endl;
      double doubleValue = static_cast<double>(encoded_value);
      // std::cout<<"doubleValue"<<value<<"\t"<<doubleValue<<std::endl;
      // uint16_t encoded_fp16 = static_cast<uint16_t>(encoded_value);
      fp16_data[row * num_cols + ind] = FP16ToFloat(doubleToFP16(doubleValue));
      // fp16_data[row * num_cols + ind] = encoded_fp16;
      // std::cout<<"encoded_fp16"<<value<<"\t"<<encoded_fp16<<std::endl;
      // std::cout<<"fp16_data"<<value<<"\t"<<fp16_data[row * num_cols + ind]<<std::endl;
    }
  }
}

TRITONSERVER_Error* TRITONBACKEND_ModelInstanceExecute(
    TRITONBACKEND_ModelInstance* instance, TRITONBACKEND_Request** requests,
    const uint32_t request_count)
{
  // std::cout<<"entered TRITONBACKEND_ModelInstanceExecute"<<std::endl;

LOG_MESSAGE(
    TRITONSERVER_LOG_VERBOSE,
    (std::string("TRITONBACKEND_ModelExecute: Running ") +
     std::to_string(request_count) + " requests").c_str());

void* vstate;
RETURN_IF_ERROR(TRITONBACKEND_ModelInstanceState(instance, &vstate));
ModelInstanceState* instance_state =
    reinterpret_cast<ModelInstanceState*>(vstate);

ModelState* model_state = instance_state->StateForModel();

// std::cout<<"model state: "<<model_state<<std::endl; // std::cout<<"request count: "<<request_count<<std::endl;

TRITONBACKEND_Response* responses[request_count];

for (uint32_t r = 0; r < request_count; ++r) {
    responses[r] = nullptr;
    TRITONBACKEND_Request* request = requests[r];
    RETURN_IF_ERROR(TRITONBACKEND_ResponseNew(&responses[r], request));
}

std::vector<TRITONBACKEND_Input*> input_tensors(request_count);
std::vector<TRITONBACKEND_Output*> output_tensors(request_count);

for (uint32_t r = 0; r < request_count; ++r) {
 RETURN_IF_ERROR(TRITONBACKEND_RequestInput(
     requests[r], model_state->InputTensorName().c_str(), &input_tensors[r]));
 const char* output_name = model_state->OutputTensorName().c_str();
 RETURN_IF_ERROR(TRITONBACKEND_ResponseOutput(
     responses[r], &output_tensors[r], output_name,
     model_state->OutputTensorDataType(),
     model_state->OutputTensorShape().data(),
     model_state->OutputTensorShape().size()));

}

for (uint32_t r = 0; r < request_count; ++r) {
    TRITONBACKEND_Input* input_tensor = input_tensors[r];
    TRITONBACKEND_Output* output_tensor = output_tensors[r];

    const void* input_buffer;
    size_t input_byte_size;
    TRITONSERVER_MemoryType input_memory_type;
    int64_t input_memory_type_id;
    RETURN_IF_ERROR(TRITONBACKEND_InputBuffer(
        input_tensor, 0, &input_buffer, &input_byte_size, &input_memory_type,
        &input_memory_type_id));

 //   TRITONSERVER_DataType input_dtype = model_state->InputTensorDataType();
    TRITONSERVER_DataType output_dtype = model_state->OutputTensorDataType();
 //   std::cout<<"input_dtype: "<<input_dtype<<std::endl;
 //   std::cout<<"output_dtype: "<<output_dtype<<std::endl;
 //   std::cout<<"input_byte_size: "<<input_byte_size<<std::endl;

    size_t num_elements = 129;
 //   std::cout<<"num_elements: "<<num_elements<<std::endl;

 //   std::vector<int64_t> input_shape = model_state->InputTensorShape();
 //   std::vector<int64_t> output_shape = model_state->OutputTensorShape();
//    std::cout<<"input_shape: "<<input_shape.size()<<std::endl;
 //   std::cout<<"output_shape: "<<output_shape.size()<<std::endl;
    std::vector<std::string> input_data(num_elements);
//    std::cout<<"input_data: "<<std::endl;
    std::vector<float> input_data_1(num_elements);
//    std::cout<<"input_data_1: "<<std::endl;
    std::vector<float> encoded_data(17);
//    std::cout<<"encoded_data: "<<std::endl;
    std::vector<std::string> string_data;

//    std::cout<<"inside if :"<<std::endl;
    const char* input_data_ptr = reinterpret_cast<const char*>(input_buffer);
    size_t offset = 0;
    size_t i=0;
    while (offset < input_byte_size) {
        uint32_t length = 0;
        memcpy(&length, input_data_ptr + offset, sizeof(uint32_t));
        offset += sizeof(uint32_t);
 //       std::cout<<"offset1: "<<offset<<std::endl;
        string_data.emplace_back(input_data_ptr + offset, length);
        offset += length;
 //       std::cout<<"offset2: "<<offset<<std::endl;
        input_data[i] = string_data.back();
        std::cout << "input string data: " <<  input_data[i]  << std::endl;
        i++;
      }

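    // Elements 0-99 are numeric features: parse each string, convert through
    // FP16, and store the widened float in input_data_1.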
    for (uint32_t ind = 0; ind < 100; ind++) {
      double value = std::stod(input_data[ind]);
      input_data_1[ind] = FP16ToFloat(doubleToFP16(value));
      std::cout<<"input_data_1 : "<<value<<"\t"<<input_data_1[ind]<<std::endl;
    }

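    // Elements 100-116 are categorical features, encoded by EncodeCategoricalData.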
    EncodeCategoricalData(encoded_data,input_data.data(), 1, 17);

    std::cout<<"processing Last 17 started : "<<std::endl;
    for (uint32_t ind = 0; ind < 17; ind++) {
 //    std::cout<<"input_data_2 : "<<ind<<"\t"<<encoded_data[ind]<<std::endl;
     input_data_1[ind+100] = encoded_data[ind];
     std::cout<<"input_data_4 : "<<ind<<"\t"<<input_data_1[ind+100]<<std::endl;
    }

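    // Elements 117-128 are numeric features, handled the same way as the first 100.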
    for (uint32_t ind = 117; ind < 129; ind++) {
     double value = std::stod(input_data[ind]);
     input_data_1[ind] = FP16ToFloat(doubleToFP16(value));
     std::cout<<"input_data_3 : "<<value<<"\t"<<input_data_1[ind]<<std::endl;
    }

    size_t output_byte_size = num_elements * TRITONSERVER_DataTypeByteSize(output_dtype);
    std::cout<<"output_byte_size: "<<output_byte_size<<std::endl;
    void* output_buffer;
    TRITONSERVER_MemoryType output_memory_type;
    int64_t output_memory_type_id;
    RETURN_IF_ERROR(TRITONBACKEND_OutputBuffer(
        output_tensor, &output_buffer, output_byte_size, &output_memory_type,
        &output_memory_type_id));

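    // Copy the processed values into this response's output buffer.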
    float* output_data_ptr = reinterpret_cast<float*>(output_buffer);
    for (size_t i = 0; i < num_elements; ++i) {
      std::cout<<"entered in output_data : "<<std::endl;
      output_data_ptr[i] = input_data_1[i];
      std::cout<<"output_data : "<<i<<"\t"<<output_data_ptr[i]<<std::endl;
    }
}
std::cout<<"ok bye : "<<std::endl;
for (uint32_t r = 0; r < request_count; ++r) {
    RETURN_IF_ERROR(TRITONBACKEND_ResponseSend(
        responses[r], TRITONSERVER_RESPONSE_COMPLETE_FINAL, nullptr));
}
std::cout<<"final bye : "<<std::endl;
return nullptr;

}

Expected behavior: The output is generated correctly in input_data_1 (as printed by the std::cout statements), but I am not getting that same output in the response.

Any lead/help is appreciated. TIA.

statiraju commented 4 months ago

[DLIS-6788] created

jgrsdave commented 2 months ago

@statiraju Can you please let me know if there is any update?