hailo-ai / Hailo-Application-Code-Examples


Yolov8 output boxes not making any sense. #320

Closed thePrimeTux closed 6 days ago

thePrimeTux commented 1 week ago

I'm using a custom-trained yolov8n model. Running it with the GStreamer pipeline provided in hailo-rpi5-examples works fine, but running the same model with the code below gives random bounding boxes. Does infer_results require additional postprocessing? What am I missing? Any help would be appreciated.

hailortcli parse-hef yolov8n.hef gave the following output:

Architecture HEF was compiled for: HAILO8L
Network group name: yolov8n, Multi Context - Number of contexts: 2
    Network name: yolov8n/yolov8n
    VStream infos:
        Input  yolov8n/input_layer1 UINT8, NHWC(640x640x3)
        Output yolov8n/yolov8_nms_postprocess FLOAT32, HAILO NMS(number of classes: 10, maximum bounding boxes per class: 100, maximum frame size: 20040)
        Operation:
            Op YOLOV8
            Name: YOLOV8-Post-Process
            Score threshold: 0.200
            IoU threshold: 0.70
            Classes: 10
            Cross classes: false
            Max bboxes per class: 100
            Image height: 640
            Image width: 640
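From this dump, the output is HailoRT's built-in NMS, not a raw tensor. As the fix further down the thread confirms, each image in infer_results comes back as a per-class list of (N, 5) arrays whose rows are (ymin, xmin, ymax, xmax, score), with coordinates normalized to [0, 1]. A minimal sketch of walking that structure (the output name comes from the dump above; the exact layout is my reading of the fix below, so treat it as an assumption):

# Sketch: iterate HailoRT NMS output. Assumed layout: infer_results[name][0]
# is a per-class list; each entry is an (N, 5) array of
# (ymin, xmin, ymax, xmax, score), normalized to [0, 1].
def iter_detections(infer_results, name='yolov8n/yolov8_nms_postprocess', thr=0.5):
    for class_id, dets in enumerate(infer_results[name][0]):
        for ymin, xmin, ymax, xmax, score in dets:
            if score > thr:
                yield class_id, (ymin, xmin, ymax, xmax), score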

Below is the code I ran

import cv2
import os, random, time
import numpy as np
from hailo_platform import (HEF, Device, VDevice, HailoStreamInterface, InferVStreams, ConfigureParams,
    InputVStreamParams, OutputVStreamParams, InputVStreams, OutputVStreams, FormatType)

INPUT_RES_H = 640
INPUT_RES_W = 640

hef_path = 'yolov8n.hef'
video_file = "test.mp4"
hef = HEF(hef_path)

devices = Device.scan()

with VDevice(device_ids=devices) as target:
        configure_params = ConfigureParams.create_from_hef(hef, interface=HailoStreamInterface.PCIe)
        network_group = target.configure(hef, configure_params)[0]
        network_group_params = network_group.create_params()
        input_vstream_info = hef.get_input_vstream_infos()[0]
        output_vstream_info = hef.get_output_vstream_infos()[0]
        input_vstreams_params = InputVStreamParams.make_from_network_group(network_group, quantized=False, format_type=FormatType.FLOAT32)
        output_vstreams_params = OutputVStreamParams.make_from_network_group(network_group, quantized=False, format_type=FormatType.FLOAT32)
        height, width, channels = hef.get_input_vstream_infos()[0].shape

        source = 'video'
        cap = cv2.VideoCapture(video_file)

        # check if the video source was opened successfully
        if not cap.isOpened():
            print("Could not open video source")
            exit()

        start_time = time.time()
        frame_count = 0

        while True:
            # read a frame from the video source
            ret, frame = cap.read()

            # check if the frame was successfully read
            if not ret:
                print("Could not read frame")
                break
            frame_count += 1
            # Get height and width from capture
            orig_w = cap.get(cv2.CAP_PROP_FRAME_WIDTH)  
            orig_h = cap.get(cv2.CAP_PROP_FRAME_HEIGHT)        

            # loop if video source
            if source == 'video' and not cap.get(cv2.CAP_PROP_POS_FRAMES) % cap.get(cv2.CAP_PROP_FRAME_COUNT):
                cap.set(cv2.CAP_PROP_POS_FRAMES, 0)

            # resize frame to the model's 640x640 input resolution and infer it
            # (cv2.resize takes (width, height))
            resized_img = cv2.resize(frame, (INPUT_RES_W, INPUT_RES_H), interpolation=cv2.INTER_AREA)
            with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
                input_data = {input_vstream_info.name: np.expand_dims(np.asarray(resized_img), axis=0).astype(np.float32)}    
                with network_group.activate(network_group_params):
                    infer_results = infer_pipeline.infer(input_data)

            print(infer_results['yolov8n/yolov8_nms_postprocess'])  
            print("===========================")   
            for key in infer_results.keys():
                for cls, results in enumerate(infer_results[key][0]):
                    for x,y,w,h,conf in results:
                        if conf > 0.5:
                            x *= orig_w
                            y *= orig_h
                            w *= orig_w
                            h *= orig_h
                            cv2.rectangle(frame, (int(x-w/2),int(y-h/2)), (int(x+w/2),int(y+h/2)), (0,0,255), 2)

            cv2.imshow('frame',frame)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                cv2.destroyAllWindows()
                break
            avg_fps = frame_count / (time.time() - start_time)
            print(f"FPS = {avg_fps}")
thePrimeTux commented 6 days ago

Turns out I was drawing it wrong: each detection row is (ymin, xmin, ymax, xmax, score) in normalized coordinates, not (x, y, w, h, conf). Using the function below solved my problem.

COLORS = np.random.randint(0, 255, size=(90, 3), dtype=np.uint8)
labels = ['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
def get_label(class_id):
    """Map a 1-based class id (background offset applied) to its label."""
    return labels[class_id - 1]

def draw_detection(image, d, c, color, scale_factor_x, scale_factor_y):
    """Draw box and label for 1 detection."""
    label = get_label(c)
    ymin, xmin, ymax, xmax = d
    # Scale coordinates
    xmin, xmax = int(xmin * scale_factor_x), int(xmax * scale_factor_x)
    ymin, ymax = int(ymin * scale_factor_y), int(ymax * scale_factor_y)

    # Draw rectangle
    cv2.rectangle(image, (xmin, ymin), (xmax, ymax), color, 2)

    # Draw label text
    label_position = (xmin + 5, ymin + 15)
    cv2.putText(image, label, label_position, cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1, cv2.LINE_AA)
    return label

def annotate_image(image, results, thr=0.45, dim=640, offset_background=True):
    oh, ow, _ = image.shape
    # scale factors from the model's dim x dim input space back to the original frame
    rh, rw = oh / dim, ow / dim

    # results[<output name>][0] is a per-class list of (N, 5) arrays whose rows
    # are (ymin, xmin, ymax, xmax, score), normalized to [0, 1]
    for idx, class_detections in enumerate(results[list(results.keys())[0]][0]):
        if class_detections.shape[0] > 0:
            color = tuple(int(c) for c in COLORS[idx])
            for det in class_detections:
                if det[4] > thr:
                    # offset_background shifts class ids by 1 when id 0 is background
                    if offset_background:
                        label = draw_detection(image, det[0:4] * dim, idx + 1, color, rw, rh)
                    else:
                        label = draw_detection(image, det[0:4] * dim, idx, color, rw, rh)
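
For completeness, here is a hypothetical call site that wires annotate_image into the per-frame loop from the first post (all names besides annotate_image are reused from that code; a sketch, not verified end to end):

# Hypothetical per-frame usage, mirroring the original loop:
ret, frame = cap.read()
resized_img = cv2.resize(frame, (640, 640), interpolation=cv2.INTER_AREA)
with InferVStreams(network_group, input_vstreams_params, output_vstreams_params) as infer_pipeline:
    input_data = {input_vstream_info.name: np.expand_dims(resized_img, axis=0).astype(np.float32)}
    with network_group.activate(network_group_params):
        results = infer_pipeline.infer(input_data)
annotate_image(frame, results)  # draws boxes in place on the full-resolution frame
cv2.imshow('frame', frame)

Note that annotate_image first scales det[0:4] by dim (640) and then by ow/dim and oh/dim, which is equivalent to multiplying the normalized coordinates by the original width and height directly.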