Confidence difference between detect and detect_image

If you run the script below, you'll see there is a noticeable difference between the confidence result for the last horse. Here are the results that I got:

detect image results: ('horse', 0.9983882308006287) ('horse', 0.9966017007827759) ('horse', 0.9061697125434875) ('horse', 0.8582225441932678)

detect results: ('horse', 0.9983503818511963) ('horse', 0.996757447719574) ('horse', 0.9087362885475159) ('horse', 0.8728268146514893)

And here's the result from running the same configuration via the command line: `./darknet detector test ./cfg/coco.data ./cfg/yolov3.cfg ./yolov3.weights data/horses.jpg`. Command line results: horse: 87%, horse: 100%, horse: 91%, horse: 100%.

What is causing `detect_image` to return different confidence results compared to the command line and `detect`? `detect_image` is being used similarly to how it's called in darknet_video.py, and `detect` is being used similarly to how it's called in darknet.py.

Feel free to test this out with other images. For example, the baseball bat in this image has a two percent difference: https://www.thecompleteuniversityguide.co.uk/media/4928283/istock-949190756.jpg


from ctypes import *
import math
import random
import os
import cv2
import numpy as np
import time
import darknet

def convertBack(x, y, w, h):
    """Turn a centre/size box into integer corner coordinates.

    Args:
        x, y: Centre of the box.
        w, h: Full width and height of the box.

    Returns:
        Tuple ``(xmin, ymin, xmax, ymax)`` where each value is the
        corresponding edge rounded with ``round`` and cast to ``int``.
    """
    half_w = w / 2
    half_h = h / 2
    left = int(round(x - half_w))
    top = int(round(y - half_h))
    right = int(round(x + half_w))
    bottom = int(round(y + half_h))
    return left, top, right, bottom

def cvDrawBoxes(detections, img):
    """Annotate ``img`` in place with one box and caption per detection.

    Args:
        detections: Iterable of detections; each is indexed as
            ``[0]`` label (bytes, decoded for display), ``[1]`` confidence
            (shown as a percentage rounded to two decimals) and ``[2]`` a
            four-element ``(x, y, w, h)`` centre/size box.
        img: Image passed to ``cv2.rectangle`` / ``cv2.putText``.

    Returns:
        The same ``img`` object that was passed in.
    """
    for det in detections:
        box = det[2]
        centre_x, centre_y, width, height = box[0], box[1], box[2], box[3]
        corners = convertBack(
            float(centre_x), float(centre_y), float(width), float(height))
        top_left = (corners[0], corners[1])
        bottom_right = (corners[2], corners[3])
        cv2.rectangle(img, top_left, bottom_right, (0, 255, 0), 1)

        # Caption sits 5 pixels above the box's top-left corner.
        caption = det[0].decode() + " [" + str(round(det[1] * 100, 2)) + "]"
        caption_origin = (top_left[0], top_left[1] - 5)
        cv2.putText(img, caption, caption_origin,
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, [0, 255, 0], 2)
    return img

def printCondifence(detections):
    """Print the first two fields of every detection, one detection per line.

    Args:
        detections: Iterable of indexable detections; element ``[0]`` and
            element ``[1]`` of each are printed separated by a space.
    """
    for det in detections:
        label, confidence = det[0], det[1]
        print(label, confidence)

# Module-level caches; each stays None until YOLO() assigns it on first use.
netMain = metaMain = altNames = None

def YOLO():
    """Compare confidences from ``darknet.detect_image`` and ``darknet.detect``.

    On first call the network, the metadata and (best-effort) the class-name
    list are loaded into the module-level ``netMain`` / ``metaMain`` /
    ``altNames`` caches. ``data/horses.jpg`` is then run through both darknet
    entry points and the label/confidence pairs of each are printed.

    Raises:
        ValueError: If the config, weight or data file does not exist, or if
            the test image cannot be read by OpenCV.
    """
    # Local import: ``re`` is not among the module-level imports.
    import re

    global metaMain, netMain, altNames
    configPath = "./cfg/yolov3.cfg"
    weightPath = "./yolov3.weights"
    metaPath = "./cfg/coco.data"

    # Fail early, with the absolute path, on any missing input file.
    for description, path in (("config", configPath),
                              ("weight", weightPath),
                              ("data file", metaPath)):
        if not os.path.exists(path):
            raise ValueError("Invalid " + description + " path `" +
                             os.path.abspath(path) + "`")

    if netMain is None:
        netMain = darknet.load_net_custom(configPath.encode(
            "ascii"), weightPath.encode("ascii"), 0, 1)  # batch size = 1
    if metaMain is None:
        metaMain = darknet.load_meta(metaPath.encode("ascii"))
    if altNames is None:
        # Best-effort: the names file is optional, so an unreadable file is
        # reported and skipped instead of aborting the run.
        try:
            with open(metaPath) as metaFH:
                metaContents = metaFH.read()
            match = re.search(r"names *= *(.*)$", metaContents,
                              re.IGNORECASE | re.MULTILINE)
            namesPath = match.group(1) if match else None
            if namesPath is not None and os.path.exists(namesPath):
                with open(namesPath) as namesFH:
                    namesList = namesFH.read().strip().split("\n")
                altNames = [name.strip() for name in namesList]
        except (OSError, UnicodeError) as err:
            print("Warning: could not load class names: " + str(err))

    print("Starting the YOLO loop...")

    netWidth = darknet.network_width(netMain)
    netHeight = darknet.network_height(netMain)

    # Create an image we reuse for each detect
    darknet_image = darknet.make_image(netWidth, netHeight, 3)

    imagePath = "data/horses.jpg"
    frame_read = cv2.imread(imagePath)
    if frame_read is None:
        # cv2.imread signals a missing/undecodable file by returning None
        # rather than raising; without this check cvtColor fails obscurely.
        raise ValueError("Invalid image path `" +
                         os.path.abspath(imagePath) + "`")
    frame_rgb = cv2.cvtColor(frame_read, cv2.COLOR_BGR2RGB)
    # NOTE(review): this path resizes the frame with OpenCV (INTER_LINEAR)
    # before handing it to darknet, whereas detect() below is given only the
    # file path. Presumably the two calls therefore do not see identical
    # pixels, which would explain small confidence differences -- confirm
    # against darknet's own image loading/resizing code.
    frame_resized = cv2.resize(frame_rgb,
                               (netWidth, netHeight),
                               interpolation=cv2.INTER_LINEAR)

    darknet.copy_image_from_bytes(darknet_image, frame_resized.tobytes())

    detections = darknet.detect_image(netMain, metaMain, darknet_image, thresh=0.25)
    print("detect image results")
    printCondifence(detections)

    detections = darknet.detect(netMain, metaMain, imagePath.encode("ascii"), thresh=0.25)
    print("detect results")
    printCondifence(detections)

# Run the comparison only when this file is executed as a script.
if __name__ == "__main__":
    YOLO()

AlexeyAB / darknet

Confidence difference between detect and detect_image #4110