ultralytics / yolov5

YOLOv5 🚀 in PyTorch > ONNX > CoreML > TFLite
https://docs.ultralytics.com
GNU Affero General Public License v3.0

Same object is detected twice with different classes #1097

Closed: awsaf49 closed this issue 4 years ago

awsaf49 commented 4 years ago

🐛 Bug

It seems that for some reason the same object is detected twice with different classes, which creates a lot of false positives. Is there any way to deal with this problem?

glenn-jocher commented 4 years ago

Update multi_label parameter here as you see fit.

Also do not use exclamation marks in issue titles.

https://github.com/ultralytics/yolov5/blob/d11504aee1679d8bb9ca3aef27d96643b90b70eb/utils/general.py#L605
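For later readers: the multi_label flag pointed at above controls whether NMS may emit the same box once per class whose score clears conf_thres, which is exactly what produces one object with two labels. Forcing it off keeps only the best class per box. A minimal sketch of the change, assuming the 2020-era body of non_max_suppression() in utils/general.py (quoted from memory, not an official patch):

    multi_label = nc > 1   # original: a box may be output once per class above conf_thres
    multi_label = False    # change: keep only the highest-confidence class per box

    # downstream, the flag selects between these two branches:
    if multi_label:
        i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
        x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
    else:  # best class only, so a box can never appear twice with different labels
        conf, j = x[:, 5:].max(1, keepdim=True)
        x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]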

awsaf49 commented 4 years ago

Thank you for your response. I have already tried that, but the same object is still detected more than once with different classes. Could you please tell me what agnostic_nms does?
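For anyone with the same question: agnostic_nms (the --agnostic-nms flag) runs NMS across all classes at once, so two heavily overlapping boxes with different class labels suppress each other instead of both surviving; without it, boxes are offset per class so NMS only compares boxes of the same class. Roughly the relevant fragment of the 2020-era non_max_suppression() (from memory):

    c = x[:, 5:6] * (0 if agnostic else max_wh)  # per-class box offset; agnostic disables it
    boxes, scores = x[:, :4] + c, x[:, 4]        # agnostic: all classes share one coordinate space
    i = torch.ops.torchvision.nms(boxes, scores, iou_thres)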

mintu07ruet commented 3 years ago

Hi @glenn-jocher, when I run yolov5 to detect only bicycles in continuous video, the same bicycle keeps being detected several times, but I want to detect it just once, not several times. Could you please tell me how I can do that? Thanks!

glenn-jocher commented 3 years ago

@mintu07ruet I don't understand what you are asking. Best practice is to include as much information as possible in your question: provide examples, screenshots, results, references, etc.

mintu07ruet commented 3 years ago

Hi @glenn-jocher,

Thank you for your response. OK, let me explain in more detail. I was detecting only bicycles at a road intersection from a video file; when a bicyclist crosses the road, YOLOv5 keeps detecting the same bicyclist several times. I have outputs in a CSV where the same bicyclist is detected several times, as highlighted in yellow in the Excel image. I want each bicyclist detected just the first time, not more than once. In the two screenshots below you can see the same bicyclist (the same person) detected twice with a time gap in between. I want this detected just once, not multiple times.

[screenshots: CSV/Excel output with duplicate bicyclist rows highlighted in yellow, and two video frames showing the same bicyclist detected twice]

Here is the modified detect file. Please suggest where I should change it. Thanks!

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Run inference on images, videos, directories, streams, etc.

Usage:
    $ python path/to/detect.py --source path/to/img.jpg --weights yolov5s.pt --img 640
"""

import argparse
import sys
import os
import time
from csv import DictWriter
from pathlib import Path

import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import numpy as np

from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, non_max_suppression, apply_classifier, scale_coords, xyxy2xywh, \
    strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized


def frames_to_timestamp(frame_num, fps):
    total_seconds = frame_num / fps
    mins, secs = divmod(total_seconds, 60)
    hours, mins = divmod(mins, 60)
    return "{}:{}:{}".format(int(hours), int(mins), secs)

def detect(save_img=False):
    source, weights, view_img, save_txt, imgsz = opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
        ('rtsp://', 'rtmp://', 'http://'))

fps = None

# Directories
save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
(save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

# Initialize
set_logging()
device = select_device(opt.device)
half = device.type != 'cpu'  # half precision only supported on CUDA

# Load model
model = attempt_load(weights, map_location=device)  # load FP32 model
imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
if half:
    model.half()  # to FP16

# Second-stage classifier
classify = False
if classify:
    modelc = load_classifier(name='resnet101', n=2)  # initialize
    modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
    modelc.to(device).eval()

# Set Dataloader
vid_path, vid_writer = None, None
if webcam:
    view_img = True
    cudnn.benchmark = True  # set True to speed up constant image size inference
    dataset = LoadStreams(source, img_size=imgsz)
else:
    save_img = True
    # The code below checks whether the specified source is a file; if so, it assigns fps a number.
    source_path = Path(source)
    if source_path.is_file():
        vid = cv2.VideoCapture(str(source_path.absolute()))
        fps = vid.get(cv2.CAP_PROP_FPS)
        print ("FPS of Video Before Data Loading : {0}".format(fps))
        vid.release()
    #################################################
    dataset = LoadImages(source, img_size=imgsz)

# Get names and colors
names = model.module.names if hasattr(model, 'module') else model.names
colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]

# Run inference
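# (the CSV is opened once here just to write the header; detection rows are appended inside the frame loop below)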
with open('log_file.csv', 'a') as csvfile:
    header = (
    'Timestamp', 'Frame', 'Class', 'AutoSteer Confidence', 'Indicator', 'Indicator Confidence', 'ProPilot',
    'ProPilot Confidence', 'VID - Lanes Detection Status', 'VID LDS Confidence', 'VID - LDW',
    'VID LDW Confidence')
    csv_writer = DictWriter(csvfile, fieldnames=header, lineterminator='\n', delimiter=',')
    csv_writer.writeheader()

t0 = time.time()
img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
_ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once

for path, img, im0s, vid_cap in dataset:
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)

    # Inference
    t1 = time_synchronized()
    pred = model(img, augment=opt.augment)[0]

    # Apply NMS
    pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
    t2 = time_synchronized()

    # Apply Classifier
    if classify:
        pred = apply_classifier(pred, modelc, img, im0s)

    # Process detections
    for i, det in enumerate(pred):  # detections per image
        if webcam:  # batch_size >= 1
            p, s, im0, frame = Path(path[i]), '%g: ' % i, im0s[i].copy(), dataset.count
        else:
            p, s, im0, frame = Path(path), '', im0s, getattr(dataset, 'frame', 0)

        save_path = str(save_dir / p.name)
        txt_path = str(save_dir / 'labels' / p.stem) + ('_%g' % dataset.frame if dataset.mode == 'video' else '')
        s += '%gx%g ' % img.shape[2:]  # print string
        gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
        if len(det):
            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
                s += '%g %ss, ' % (n, names[int(c)])  # add to string

            # Write results
            tl1 = cnf1 = tl2 = cnf2 = tl3 = cnf3 = tl4 = cnf4 = tl5 = cnf5 = ''
            for *xyxy, conf, cls in reversed(det):
                if save_img or view_img:  # Add bbox to image
                    label = '%s %.2f' % (names[int(cls)], conf)
                    plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)

                if save_txt:
                    icls = int(cls)
                    if icls == 0 or icls == 1:
                        tl1 = int(cls)
                        cnf1 = '%.2f' % (conf)
                    if icls == 2 or icls == 3:
                        tl2 = int(cls)
                        cnf2 = '%.2f' % (conf)
                    if icls == 4 or icls == 5 or icls == 6:
                        tl3 = int(cls)
                        cnf3 = '%.2f' % (conf)
                    if icls == 7 or icls == 8:
                        tl4 = int(cls)
                        cnf4 = '%.2f' % (conf)
                    if icls == 9 or icls == 10:
                        tl5 = int(cls)
                        cnf5 = '%.2f' % (conf)

            with open('log_file.csv', 'a') as csvfile:
                csv_writer = DictWriter(csvfile, fieldnames=header, lineterminator='\n', delimiter=',')
                csv_writer.writerow(
                    {'Timestamp': frames_to_timestamp(frame, fps), 'Frame': frame, 'Class': tl1,
                     'AutoSteer Confidence': cnf1, 'Indicator': tl2,
                     'Indicator Confidence': cnf2, 'ProPilot': tl3, 'ProPilot Confidence': cnf3,
                     'VID - Lanes Detection Status': tl4, 'VID LDS Confidence': cnf4, 'VID - LDW': tl5,
                     'VID LDW Confidence': cnf5})

        # Print time (inference + NMS)
        print('%sDone. (%.3fs)' % (s, t2 - t1))

        # Stream results
        if view_img:
            cv2.imshow(str(p), im0)
            if cv2.waitKey(1) == ord('q'):  # q to quit
                raise StopIteration

        # Save results (image with detections)
        if save_img:
            if dataset.mode == 'images':
                cv2.imwrite(save_path, im0)
            else:
                if vid_path != save_path:  # new video
                    vid_path = save_path
                    if isinstance(vid_writer, cv2.VideoWriter):
                        vid_writer.release()  # release previous video writer

                    fourcc = 'mp4v'  # output video codec
                    fps = vid_cap.get(cv2.CAP_PROP_FPS)
                    print(f'FPS of video {fps}')
                    w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                    h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                vid_writer.write(im0)

if save_txt or save_img:
    s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
    print(f"Results saved to {save_dir}{s}")

print('Done. (%.3fs)' % (time.time() - t0))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)')
    parser.add_argument('--source', type=str, default='data/images', help='source')  # file/folder, 0 for webcam
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='display results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default='runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    opt = parser.parse_args()
    print(opt)

    with torch.no_grad():
        if opt.update:  # update all models (to fix SourceChangeWarning)
            for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']:
                detect()
                strip_optimizer(opt.weights)
        else:
            detect()

glenn-jocher commented 3 years ago

@mintu07ruet thanks for the details! YOLOv5 is just a detector, it does not track. To implement tracking in between detection frames you might want to look into Kalman Filters (i.e. EKF), KLT trackers (which use pyramid image patches), or other methods like deepsort, which has been implemented with YOLOv5, i.e.: https://github.com/mikel-brostrom/Yolov5_DeepSort_Pytorch
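For readers who want the simplest possible starting point before reaching for a full tracker, below is a naive sketch that drops re-detections by IoU overlap with the previous frame. It is illustrative only: suppress_repeats, the 0.5 threshold, and the bookkeeping are our own, not part of YOLOv5 or DeepSORT, and it will double-count fast-moving objects, which is precisely what Kalman-filter trackers fix:

    import torch
    from torchvision.ops import box_iou

    def suppress_repeats(det, prev_boxes, iou_thres=0.5):
        # det: (N, 6) tensor [x1, y1, x2, y2, conf, cls] from non_max_suppression
        # prev_boxes: (M, 4) tensor of all boxes seen on the previous frame
        if det is None or len(det) == 0 or len(prev_boxes) == 0:
            return det
        overlap = box_iou(det[:, :4], prev_boxes)      # (N, M) pairwise IoU
        fresh = overlap.max(dim=1).values < iou_thres  # no match in the last frame
        return det[fresh]

    # per-frame usage: start with prev_boxes = torch.zeros((0, 4)), then
    #   all_boxes = det[:, :4].clone() if len(det) else prev_boxes  # everything seen this frame
    #   det = suppress_repeats(det, prev_boxes)                     # keep first appearances only
    #   prev_boxes = all_boxes                                      # remember for the next frame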

mintu07ruet commented 3 years ago

Hi glenn-jocher, thank you so much for sharing this. I will work on these things!! Appreciated!!

oes5756 commented 2 years ago

@mintu07ruet I'm curious how you print the results out to the Excel file used with yolov5. Can you share the Excel output timestamp code?

mintu07ruet commented 2 years ago

# YOLOv5 🚀 by Ultralytics, GPL-3.0 license
"""
Run inference on images, videos, directories, streams, etc.

Usage:
    $ python path/to/detect.py --source path/to/img.jpg --weights yolov5s.pt --img 640
"""

import argparse
import sys
import os
import time
from csv import DictWriter
from pathlib import Path

import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import numpy as np

from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, non_max_suppression, apply_classifier, scale_coords, xyxy2xywh, \
    strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized

# If an object is detected, the loop will skip detection on the next X seconds of frames.

SECONDS_DELAY_AFTER_DETECTION = 2

def frames_to_timestamp(frame_num, fps):
    """Converts the current frame number into a timestamp based on the video's fps.
    Anything over 24 hours is displayed accurately. Example for 100 hour timestamp: 100:00:00"""
    total_seconds = frame_num / fps
    mins, secs = divmod(total_seconds, 60)
    hours, mins = divmod(mins, 60)
    return "{}:{}:{}".format(int(hours), int(mins), secs)

def detect(save_img=False):
    source, weights, view_img, save_txt, imgsz = opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
        ('rtsp://', 'rtmp://', 'http://'))

fps = None
last_frame_detected = None

# Directories
save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
(save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

# Initialize
set_logging()
device = select_device(opt.device)
half = device.type != 'cpu'  # half precision only supported on CUDA

# Load model
model = attempt_load(weights, map_location=device)  # load FP32 model
imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
if half:
    model.half()  # to FP16

# Second-stage classifier
classify = False
if classify:
    modelc = load_classifier(name='resnet101', n=12)  # initialize
    modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
    modelc.to(device).eval()

# Set Dataloader
vid_path, vid_writer = None, None
if webcam:
    view_img = True
    cudnn.benchmark = True  # set True to speed up constant image size inference
    dataset = LoadStreams(source, img_size=imgsz)
else:
    save_img = True
    # The code below checks whether the specified source is a file; if so, it assigns fps a number.
    source_path = Path(source)
    if source_path.is_file():
        vid = cv2.VideoCapture(str(source_path.absolute()))
        fps = vid.get(cv2.CAP_PROP_FPS)
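        # start the cooldown clock in the past so the very first detection is never suppressed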
        last_frame_detected = (SECONDS_DELAY_AFTER_DETECTION*fps)*-1
        print("FPS of Video Before Data Loading : {0}".format(fps))
        print(f"INITIAL LASTFRAMEDETECTED VALUE: {last_frame_detected}")
        vid.release()
    #################################################
    dataset = LoadImages(source, img_size=imgsz)

# Get names and colors
names = model.module.names if hasattr(model, 'module') else model.names
colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]

# Run inference
with open(f"{save_dir / 'log_file.csv'}", 'a') as csvfile:
    header = (
        'Timestamp', 'Frame', 'Class')
    csv_writer = DictWriter(csvfile, fieldnames=header, lineterminator='\n', delimiter=',')
    csv_writer.writeheader()

t0 = time.time()
img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
_ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once

for path, img, im0s, vid_cap in dataset:
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)

    # Inference
    t1 = time_synchronized()
    pred = model(img, augment=opt.augment)[0]

    # Apply NMS
    pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
    t2 = time_synchronized()

    # Apply Classifier
    if classify:
        pred = apply_classifier(pred, modelc, img, im0s)

    # Process detections
    for i, det in enumerate(pred):  # detections per image
        if webcam:  # batch_size >= 1
            p, s, im0, frame = Path(path[i]), '%g: ' % i, im0s[i].copy(), dataset.count
        else:
            p, s, im0, frame = Path(path), '', im0s, getattr(dataset, 'frame', 0)

        save_path = str(save_dir / p.name)
        txt_path = str(save_dir / 'labels' / p.stem) + ('_%g' % dataset.frame if dataset.mode == 'video' else '')
        s += '%gx%g ' % img.shape[2:]  # print string
        gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
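        # cooldown gate: log detections only after SECONDS_DELAY_AFTER_DETECTION seconds of frames
        # have elapsed since the last logged detection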
        if len(det) and ((frame - (last_frame_detected + (SECONDS_DELAY_AFTER_DETECTION * fps))) >= 0):
            # if any bicycle is detected, its frame number is recorded.
            last_frame_detected = frame

            # Rescale boxes from img_size to im0 size
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

            # Print results
            for c in det[:, -1].unique():
                n = (det[:, -1] == c).sum()  # detections per class
                s += '%g %ss, ' % (n, names[int(c)])  # add to string

            # Write results
            tl1 = cnf1 = tl2 = cnf2 = tl3 = cnf3 = tl4 = cnf4 = tl5 = cnf5 = tl6 = cnf6 = ''
            for *xyxy, conf, cls in reversed(det):
                if save_img or view_img:  # Add bbox to image
                    label = '%s %.2f' % (names[int(cls)], conf)
                    plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)

                if save_txt:
                    icls = int(cls)
                    if icls == 0 or icls == 1:
                        tl1 = int(cls)
                        cnf1 = '%.2f' % (conf)
                    if icls == 2 or icls == 3:
                        tl2 = int(cls)
                        cnf2 = '%.2f' % (conf)
                    if icls == 4 or icls == 5 or icls == 6:
                        tl3 = int(cls)
                        cnf3 = '%.2f' % (conf)
                    if icls == 7 or icls == 8:
                        tl4 = int(cls)
                        cnf4 = '%.2f' % (conf)
                    if icls == 9 or icls == 10:
                        tl5 = int(cls)
                        cnf5 = '%.2f' % (conf)
                    if icls == 17 or icls == 36:
                        tl6 = int(cls)
                        cnf6 = '%.2f' % (conf)

            with open(f"{save_dir / 'log_file.csv'}", 'a') as csvfile:
                csv_writer = DictWriter(csvfile, fieldnames=header, lineterminator='\n', delimiter=',')
                csv_writer.writerow(
                    {'Timestamp': frames_to_timestamp(frame, fps), 'Frame': frame, 'Class': tl1,
                     })

        # Print time (inference + NMS)
        print('%sDone. (%.3fs)' % (s, t2 - t1))

        # Stream results
        if view_img:
            cv2.imshow(str(p), im0)
            if cv2.waitKey(1) == ord('q'):  # q to quit
                raise StopIteration

        # Save results (image with detections)
        if save_img:
            if dataset.mode == 'images':
                cv2.imwrite(save_path, im0)
            else:
                if vid_path != save_path:  # new video
                    vid_path = save_path
                    if isinstance(vid_writer, cv2.VideoWriter):
                        vid_writer.release()  # release previous video writer

                    fourcc = 'mp4v'  # output video codec
                    fps = vid_cap.get(cv2.CAP_PROP_FPS)
                    print(f'FPS of video {fps}')
                    w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
                    h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
                    vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                vid_writer.write(im0)

if save_txt or save_img:
    s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
    print(f"Results saved to {save_dir}{s}")

print('Done. (%.3fs)' % (time.time() - t0))

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default='yolov5s.pt', help='model.pt path(s)')
    parser.add_argument('--source', type=str, default='data/images', help='source')  # file/folder, 0 for webcam
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='display results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default='runs/detect', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    opt = parser.parse_args()
    print(opt)

    with torch.no_grad():
        if opt.update:  # update all models (to fix SourceChangeWarning)
            for opt.weights in ['yolov5s.pt', 'yolov5m.pt', 'yolov5l.pt', 'yolov5x.pt']:
                detect()
                strip_optimizer(opt.weights)
        else:
            detect()

@oes5756, I have used the above modified script to record the outcomes in Excel. Thanks!