GeekAlexis / FastMOT

High-performance multiple object tracking based on YOLO, Deep SORT, and KLT 🚀

Integrate custom YOLOv5 #184

Closed: umairjavaid closed this 3 years ago

umairjavaid commented 3 years ago

Apologies in advance, I'm new to TensorRT, so I am seeking your help.

I am trying to integrate a YOLOv5 model into your pipeline. I made a separate class for YOLOv5 in yolov5.py, set the plugin path, and reshaped the output tensor in the postprocess function accordingly. I also wrote my own preprocess and postprocess functions in YoloDetector. Unfortunately, the model output is all zeros.

In the function infer_async, execution reaches the second if statement, i.e. self.engine.has_implicit_batch_dimension, where self.context.execute_async(batch_size=self.batch_size, bindings=self.bindings, stream_handle=self.stream.ptr) is called. Correct me if I am wrong, but I think my inputs are not being copied to the GPU here, which would explain the all-zero output. How can I fix this?
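
For context, with an implicit-batch engine, execute_async simply launches the network on whatever device memory the pointers in bindings refer to; there is no separate input copy inside the call, so the preprocessed frame must already sit in the bound buffer. A minimal sketch of that pattern (names are illustrative, not FastMOT's exact internals):

import cupy as cp

# Persistent device buffers; their raw pointers are what go into `bindings`.
inp_handle = cp.zeros((1, 3, 640, 640), dtype=cp.float32)
out_handle = cp.zeros(6001, dtype=cp.float32)
bindings = [inp_handle.data.ptr, out_handle.data.ptr]

def infer_async(context, stream, batch_size, frame_chw):
    # Write INTO the bound buffer; the engine reads this exact memory.
    inp_handle[:] = cp.asarray(frame_chw, dtype=cp.float32)
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.ptr)
    return out_handle  # copy back with cp.asnumpy() after synchronizing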

The YOLOv5 class added in yolov5.py:

class YOLOv5(YOLO):
    ENGINE_PATH = Path("/workspace/trtx-fastmot/tensorrtx/yolov5/build/yolov5s.engine")
    NUM_CLASSES = 80
    INPUT_SHAPE = (3, 640, 640)
    LAYER_FACTORS = [8, 16, 32]
    SCALES = [1.2, 1.1, 1.05]
    ANCHORS = [[11,22, 24,60, 37,116],
               [54,186, 69,268, 89,369],
               [126,491, 194,314, 278,520]]

The preprocess and postprocess functions in detector.py:

def _preprocess(self, frame):
        image_raw = frame
        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        input_w = 640
        input_h = 640
        # Calculate width and height of the padding
        r_w = input_w / w
        r_h = input_h / h
        if r_h > r_w:
            tw = input_w
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((input_h - th) / 2)
            ty2 = input_h - th - ty1
        else:
            tw = int(r_h * w)
            th = input_h
            tx1 = int((input_w - tw) / 2)
            tx2 = input_w - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        #print("image: ",image)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image).ravel()
        print("@@@@@@@@@@@@ \n image: ",image)
        with self.backend.stream:
            self.inp_handle = cp.asarray(image)
            print("self.inp_handle: ", self.inp_handle)

    def postprocess(self):
        """Synchronizes, applies postprocessing, and returns a record array
        of detections (DET_DTYPE).
        This function should be called after `detect_async`.
        Detections with the same labels have consecutive indices.
        """
        det_out = self.backend.synchronize()
        num = int(det_out[0][0])
        det_out = np.reshape(det_out[0][1:], (-1, 6))[:num, :]

        detections = self._filter_dets(det_out, self.upscaled_sz, self.class_ids, self.conf_thresh,
                                       self.nms_thresh, self.max_area, self.min_aspect_ratio,
                                       self.bbox_offset)
        detections = np.fromiter(detections, DET_DTYPE, len(detections)).view(np.recarray)
        return detections
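
For intuition, here is what the letterbox math in _preprocess above works out to for a 1280x720 frame (the values follow directly from the code):

w, h, input_w, input_h = 1280, 720, 640, 640
r_w, r_h = input_w / w, input_h / h   # 0.5, 0.889 -> r_h > r_w
tw, th = input_w, int(r_w * h)        # resize to 640 x 360
ty1 = int((input_h - th) / 2)         # 140 gray rows padded on top
ty2 = input_h - th - ty1              # 140 gray rows on the bottom
print(tw, th, ty1, ty2)               # 640 360 140 140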
umairjavaid commented 3 years ago

This is how the YOLOv5 TensorRT model is implemented in the tensorrtx repo. Can you help me integrate it into your code? Also, if possible, can you point out the differences between your implementation and theirs?

class YoLov5TRT(object):
    """
    description: A YOLOv5 class that wraps TensorRT ops, preprocess and postprocess ops.
    """

    def __init__(self, engine_file_path):
        # Create a Context on this device,
        self.ctx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            print('binding:', binding, engine.get_binding_shape(binding))
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                self.input_w = engine.get_binding_shape(binding)[-1]
                self.input_h = engine.get_binding_shape(binding)[-2]
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings
        self.batch_size = engine.max_batch_size

    def infer(self, raw_image_generator):
        threading.Thread.__init__(self)  # leftover from the tensorrtx demo; harmless but unnecessary here
        # Make self the active context, pushing it on top of the context stack.
        self.ctx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        batch_image_raw = []
        batch_origin_h = []
        batch_origin_w = []
        batch_input_image = np.empty(shape=[self.batch_size, 3, self.input_h, self.input_w])
        for i, image_raw in enumerate(raw_image_generator):
            input_image, image_raw, origin_h, origin_w = self.preprocess_image(image_raw)
            batch_image_raw.append(image_raw)
            batch_origin_h.append(origin_h)
            batch_origin_w.append(origin_w)
            np.copyto(batch_input_image[i], input_image)
        print("batch_input_image.shape: ",batch_input_image.shape)
        cv2.imshow("batch_input_image.shape", batch_input_image.shape)
        batch_input_image = np.ascontiguousarray(batch_input_image)

        # Copy input image to host buffer
        np.copyto(host_inputs[0], batch_input_image.ravel())
        start = time.time()
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(batch_size=self.batch_size, bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        end = time.time()
        # Remove any context from the top of the context stack, deactivating it.
        self.ctx.pop()
        # Here we use the first row of output since batch_size = 1
        print("host_outputs: ", host_outputs)
        print("host_outputs[0].shape: ", host_outputs[0].shape)
        output = host_outputs[0]
        # Do postprocess
        for i in range(self.batch_size):
            print("i: ",i)
            result_boxes, result_scores, result_classid = self.post_process(
                output[i * 6001: (i + 1) * 6001], batch_origin_h[i], batch_origin_w[i]
            )
            # Draw rectangles and labels on the original image
            for j in range(len(result_boxes)):
                box = result_boxes[j]
                plot_one_box(
                    box,
                    batch_image_raw[i],
                    label="{}:{:.2f}".format(
                        categories[int(result_classid[j])], result_scores[j]
                    ),
                )
        print("result_boxes.shape: ", result_boxes.shape)
        print("result_scores.shape: ",result_scores.shape)
        print("result_classid: ",result_classid.shape)

        return batch_image_raw, end - start
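
As an aside, the magic number 6001 in the slicing above reflects the yololayer plugin's output layout: one float holding the detection count, followed by a fixed-size block of candidate boxes of 6 floats each (presumably the plugin's default box cap of 1000 in tensorrtx):

MAX_OUTPUT_BBOX_COUNT = 1000              # assumed plugin box cap
stride = 1 + MAX_OUTPUT_BBOX_COUNT * 6    # count + [cx,cy,w,h,conf,cls] rows
assert stride == 6001                     # matches the per-image slicing above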
GeekAlexis commented 3 years ago

What caught my attention is that you are overwriting the self.inp_handle reference instead of copying the preprocessed image into it, like so:

self.inp_handle[:] = cp.asarray(image)

Also, the preprocessing step of YOLOv5 looks identical to this repo's when you set LETTERBOX=True. The inference step should also work. I'm not sure about postprocessing since you did not show their function; the layout of YOLOv5's output can be different.
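
To illustrate why the in-place copy matters (a standalone CuPy example, not FastMOT code): TensorRT only ever sees the raw pointer that was registered in bindings, so rebinding the Python name allocates a new array and leaves the engine reading the old, untouched memory.

import cupy as cp

inp_handle = cp.zeros(8, dtype=cp.float32)      # bound device buffer
bound_ptr = inp_handle.data.ptr                 # pointer stored in bindings

inp_handle = cp.arange(8, dtype=cp.float32)     # rebinds the name only
assert inp_handle.data.ptr != bound_ptr         # engine would still read zeros

inp_handle = cp.zeros(8, dtype=cp.float32)      # fresh bound buffer
bound_ptr = inp_handle.data.ptr
inp_handle[:] = cp.arange(8, dtype=cp.float32)  # in-place copy
assert inp_handle.data.ptr == bound_ptr         # bound memory now holds the data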

umairjavaid commented 3 years ago

Thank you for replying. Their postprocessing function is as follows:

def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A tensor likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] 
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a tensor, each element is the score correspoing to box
            result_classid: finally classid, a tensor, each element is the classid correspoing to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two-dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # to a torch Tensor
        pred = torch.Tensor(pred).cuda()
        # Get the boxes
        boxes = pred[:, :4]
        # Get the scores
        scores = pred[:, 4]
        # Get the classid
        classid = pred[:, 5]
        # Choose those boxes that score > CONF_THRESH
        si = scores > CONF_THRESH
        boxes = boxes[si, :]
        scores = scores[si]
        classid = classid[si]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
        # Do nms
        indices = torchvision.ops.nms(boxes, scores, iou_threshold=IOU_THRESHOLD).cpu()
        result_boxes = boxes[indices, :].cpu()
        result_scores = scores[indices].cpu()
        result_classid = classid[indices].cpu()
        return result_boxes, result_scores, result_classid
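
The xywh2xyxy helper referenced above is not shown; given the letterbox preprocessing, it has to undo both the resize ratio and the padding before the boxes make sense in the original image. A rough reconstruction of the required math (my sketch, not tensorrtx's exact helper):

import numpy as np

def xywh2xyxy_letterbox(boxes, origin_h, origin_w, input_h=640, input_w=640):
    """Map (N, 4) [cx, cy, w, h] boxes from the 640x640 letterboxed space
    back to the original image as [x1, y1, x2, y2]."""
    r = min(input_w / origin_w, input_h / origin_h)  # resize ratio used above
    pad_x = (input_w - r * origin_w) / 2             # left/right padding
    pad_y = (input_h - r * origin_h) / 2             # top/bottom padding
    xyxy = np.empty_like(boxes)
    xyxy[:, 0] = (boxes[:, 0] - boxes[:, 2] / 2 - pad_x) / r
    xyxy[:, 1] = (boxes[:, 1] - boxes[:, 3] / 2 - pad_y) / r
    xyxy[:, 2] = (boxes[:, 0] + boxes[:, 2] / 2 - pad_x) / r
    xyxy[:, 3] = (boxes[:, 1] + boxes[:, 3] / 2 - pad_y) / r
    return xyxy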
GeekAlexis commented 3 years ago

You need to modify _filter_dets() too, because you no longer need to multiply scores to get the detection confidence.
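
For example, a YOLOv5 variant of that filtering step might look like this (a sketch assuming the [cx, cy, w, h, conf, cls_id] row layout from post_process above; FastMOT's actual _filter_dets also applies class, area, and aspect-ratio filters):

import numpy as np

def filter_dets_yolov5(det_out, conf_thresh):
    # det_out: (N, 6) rows of [cx, cy, w, h, conf, cls_id]; conf is already
    # the final score, so no obj_conf * cls_conf product is needed
    keep = det_out[:, 4] > conf_thresh
    det_out = det_out[keep]
    tlwh = det_out[:, :4].copy()
    tlwh[:, :2] -= tlwh[:, 2:] / 2  # center x, y -> top-left x, y
    return tlwh, det_out[:, 4], det_out[:, 5].astype(np.int64)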

tgbaoo commented 9 months ago

@umairjavaid Have you gotten this working? Could I consult your repo?