float *current_person = output + index * out_size;
for (int number = 0; number < m_num_key_points; number++) {
float *current_point = current_person + feature_size * number;
auto max_pos = std::max_element(current_point, current_point + feature_size);
// assume max_pos is current_point + feature_size
// assume number = num_key_points - 1 i.e. the last number in [0...num_key_points)
// assume we are at the last feature, i.e. already at current_position = output + (batch_size - 1) * out_size;
// allocated size of float* out is feature_size * num_keypoints * batch_size = out_size * batch_size
// that means that max_pos = output + (batch_size - 1) * out_size + feature_size * num_key_points
// = output + batch_size * out_size <- already bounds the allocated memory
// then on top of that you try to add (max_pos + IMAGE_WIDTH / 4): output + batch_size * out_size + IMAGE_WIDTH / 4 > max element for output + batch_size * out_size last (element position)
// same out of bound access happens with the lower bound when you subtract IMAGE_WITH / 4
float x = (max_pos - current_point) % (IMAGE_WIDTH / 4) + (*(max_pos + 1) > *(max_pos - 1) ? 0.25 : -0.25);
float y = (max_pos - current_point) / (IMAGE_WIDTH / 4) + (*(max_pos + IMAGE_WIDTH / 4) > *(max_pos - IMAGE_WIDTH / 4) ? 0.25 : -0.25);
//
}
https://github.com/linghu8812/tensorrt_inference/blob/master/code/src/mmpose.cpp#L75