Closed umairjavaid closed 3 years ago
This is how the YOLOv5 TensorRT model is implemented in the tensorrtx repo. Can you help me figure out how I can integrate it into your code? Also, if possible, can you point out the differences between your implementation and theirs?
class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.

    The constructor creates a CUDA context on device 0, deserializes a
    TensorRT engine from disk and allocates one page-locked host buffer and
    one device buffer per engine binding. ``infer`` then runs one batch
    through the engine and draws the detections on the input images.
    """

    def __init__(self, engine_file_path):
        """
        description: Load a serialized TensorRT engine and allocate its I/O buffers.
        param:
            engine_file_path: path of the serialized engine file to deserialize
        """
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            binding_shape = engine.get_binding_shape(binding)
            print('bingding:', binding, binding_shape)
            # Every buffer is sized for the largest batch the engine accepts.
            size = trt.volume(binding_shape) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                # The last two dimensions of the input binding are read as
                # (height, width) of the network input.
                self.input_w = binding_shape[-1]
                self.input_h = binding_shape[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        """
        description: Run one batch through the engine and draw the detections.
        param:
            raw_image_generator: iterable of raw images; each one is passed to
                self.preprocess_image. It must not yield more images than
                self.batch_size.
        return:
            batch_image_raw: list of the images returned by preprocess_image,
                with boxes and labels drawn on them by plot_one_box
            elapsed: seconds spent on host-to-device copy, inference and
                device-to-host copy (preprocess and postprocess excluded)
        """
        stream = self.stream
        context = self.context
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings

        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []

        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        try:
            # Do image preprocess. Slots beyond the number of supplied images
            # stay uninitialized; their outputs are never post-processed.
            batch_input_image = np.empty(
                shape=[self.batch_size, 3, self.input_h, self.input_w]
            )
            for i, image_raw in enumerate(raw_image_generator):
                input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
                batch_image_raw.append(image_raw)
                batch_origin_h.append(origin_h)
                batch_origin_w.append(origin_w)
                np.copyto(batch_input_image[i], input_image)
            print("batch_input_image.shape: ", batch_input_image.shape)
            batch_input_image = np.ascontiguousarray(batch_input_image)

            # Copy input image to host buffer
            np.copyto(host_inputs[0], batch_input_image.ravel())
            start = time.time()
            # Transfer input data to the GPU.
            cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
            # Run inference.
            context.execute_async(
                batch_size=self.batch_size,
                bindings=bindings,
                stream_handle=stream.handle,
            )
            # Transfer predictions back from the GPU.
            cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
            # Synchronize the stream
            stream.synchronize()
            end = time.time()
        finally:
            # Remove any context from the top of the context stack, deactivating
            # it. Done in ``finally`` so a failure above cannot leave the
            # context pushed.
            self.ctx.pop()

        print("host_outputs: ", host_outputs)
        print("host_outputs[0].shape: ", host_outputs[0].shape)
        output = host_outputs[0]
        # The output buffer holds one equally sized slice per batch slot
        # (6001 floats for the stock tensorrtx YOLOv5 engine).
        output_stride = output.size // self.batch_size

        # Do postprocess, only for the images that were actually supplied.
        for i in range(len(batch_image_raw)):
            print("i: ", i)
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * output_stride: (i + 1) * output_stride],
                batch_origin_h[i],
                batch_origin_w[i],
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
            print("result_boxes.shape: ", result_boxes.shape)
            print("result_scores.shape: ", result_scores.shape)
            print("result_classid: ", result_classid.shape)
        return batch_image_raw, end - start
What caught my attention is you are overwriting self.inp_handle
reference instead of copying the preprocessed image to it like so:
self.inp_handle[:] = cp.asarray(image)
Also, the preprocessing step of YOLOv5 looks identical to this repo's when you set LETTERBOX=True.
The inference step should also work. I'm not sure about postprocessing, since you did not show their function. The layout of YOLOv5’s output can be different.
Thank you for replying. Their postprocessing function is as follows:
def post_process(self, output, origin_h, origin_w):
    """
    description: Turn one image's raw engine output into final detections.
    param:
        output: A flat tensor laid out as
            [num_boxes, cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...]
        origin_h: height of the original image
        origin_w: width of the original image
    return:
        result_boxes: final boxes, one row per box as [x1, y1, x2, y2]
        result_scores: final scores, one element per kept box
        result_classid: final class ids, one element per kept box
    """
    # The first element is the detection count; the rest are 6-value records.
    detection_count = int(output[0])
    records = np.reshape(output[1:], (-1, 6))[:detection_count, :]
    detections = torch.Tensor(records).cuda()

    # Drop every detection whose confidence does not exceed CONF_THRESH.
    confident = detections[:, 4] > CONF_THRESH
    kept_boxes = detections[:, :4][confident, :]
    kept_scores = detections[:, 4][confident]
    kept_classid = detections[:, 5][confident]

    # Convert [center_x, center_y, w, h] to [x1, y1, x2, y2].
    kept_boxes = self.xywh2xyxy(origin_h, origin_w, kept_boxes)

    # Non-maximum suppression, then move the survivors back to the CPU.
    survivors = torchvision.ops.nms(
        kept_boxes, kept_scores, iou_threshold=IOU_THRESHOLD
    ).cpu()
    return (
        kept_boxes[survivors, :].cpu(),
        kept_scores[survivors].cpu(),
        kept_classid[survivors].cpu(),
    )
You need to modify filter_dets() too because you do not need to multiply to get detection score/confidence anymore.
@umairjavaid Have you done already? Could I consult your repo?
Apologies in advance; I'm new to TensorRT, so I am seeking your help.
I am trying to integrate yolov5 model in your pipeline. I have made a separate class for yolov5 in yolov5.py. I have set the plugin path and reshaped the output tensor shape in the postprocess function accordingly. I have also written my own preprocess and postprocess function in YoloDetector. But unfortunately, my model returns all zeros in model output.
In the function infer_async, my code goes into the second if statement, i.e.
self.engine.has_implicit_batch_dimension:
In this branch, self.context.execute_async(batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.ptr)
gets called. Correct me if I am wrong, but I think my inputs are not being copied to the GPU here. Therefore, I am getting all zeros from my model output. How can I fix it?
yolov5 class added in yolov5.py
preprocess and postprocess in detector.py