IntelRealSense / librealsense

Intel® RealSense™ SDK
https://www.intelrealsense.com/
Apache License 2.0

RealSense camera runs normally with yolov5s but not with yolov5s-seg on Ubuntu 20.04 #12391

Closed. Anthonia2001 closed this issue 1 year ago.

Anthonia2001 commented 1 year ago

Required Info
Camera Model: D435 (D400 series)
Firmware Version: 5.15.1
Operating System & Version: Ubuntu 20.04 LTS
Kernel Version (Linux Only): 5.15.0-88-generic
Platform: PC
SDK Version: 2.54.2
Language: Python 3.8.10 (distributed with ROS Noetic)
Segment: Robot/Smartphone/VR/AR/others

Issue Description

Hi! I'm applying YOLOv5 to detect objects and publish the data to my ROS node. I installed librealsense from this link (https://github.com/IntelRealSense/librealsense/blob/master/doc/installation.md) with CMake, including the flag -DFORCE_RSUSB_BACKEND=TRUE in the CMake build instruction, and installed pyrealsense2 with pip.
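
As a sanity check that librealsense and pyrealsense2 installed correctly, a small snippet like this (just a sketch) lists the cameras the SDK can see:

import pyrealsense2 as rs

# Print every connected RealSense device with its name and firmware version
ctx = rs.context()
for dev in ctx.query_devices():
    print(dev.get_info(rs.camera_info.name),
          dev.get_info(rs.camera_info.firmware_version))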

At first, I modified and ran this code with the yolov5s.pt model:

import argparse
import os #ok
import sys #ok
from pathlib import Path #ok
import pyrealsense2 as rs #ok
import numpy as np #ok
import cv2 #ok
import torch #ok
import torch.backends.cudnn as cudnn #ok
import time #ok
from tkinter import ttk 
import tkinter as tk
import tkinter.font as font
import math
import threading

import rospy
from std_msgs.msg import Float64MultiArray
from std_msgs.msg import Int16

FILE = Path(__file__).resolve() 
ROOT = FILE.parents[0]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

from models.common import DetectMultiBackend
from utils.datasets import IMG_FORMATS, VID_FORMATS, LoadImages, LoadStreams
from utils.general import (LOGGER, check_file, check_img_size, check_imshow, check_requirements, colorstr,
                           increment_path, non_max_suppression, print_args, scale_coords, strip_optimizer, xyxy2xywh)
from utils.plots import Annotator, colors, save_one_box
from utils.torch_utils import select_device, time_sync

cv_font = cv2.FONT_HERSHEY_SIMPLEX

object_coordinates = [] #label, x, y
delete_objects = False

signal_1 = int()

def receive(msg5):
    global signal_1
    signal_1 = msg5.data
    rospy.loginfo(signal_1)

def find_plane(points):
    # Least-squares plane fit: the right singular vector with the smallest
    # singular value of the centered points is the plane normal.
    c = np.mean(points, axis=0)
    r0 = points - c
    u, s, v = np.linalg.svd(r0)
    nv = v[-1, :]  # plane normal
    ds = np.dot(points, nv)
    param = np.r_[nv, -np.mean(ds)]  # [a, b, c, d] of the plane ax + by + cz + d = 0
    return param

#@torch.no_grad()
def run():
    global object_coordinates, delete_objects,signal_1
    weights=ROOT / 'yolov5s.pt'  # model.pt path(s)
    source=ROOT / 'data/images'  # file/dir/URL/glob, 0 for webcam
    data=ROOT / 'data/coco128.yaml'  # dataset.yaml path
    imgsz=(416, 224)  # inference size (height, width)
    conf_thres=0.25  # confidence threshold
    iou_thres=0.45  # NMS IOU threshold
    max_det=1 # maximum detections per image
    device_num=''  # cuda device, i.e. 0 or 0,1,2,3 or cpu
    view_img=False  # show results
    save_txt=False  # save results to *.txt
    save_conf=False  # save confidences in --save-txt labels
    save_crop=False  # save cropped prediction boxes
    nosave=False  # do not save images/videos
    classes=None  # filter by class: --class 0, or --class 0 2 3
    agnostic_nms=False  # class-agnostic NMS
    augment=False  # augmented inference
    visualize=False  # visualize features
    update=False  # update all models
    project=ROOT / 'runs/detect'  # save results to project/name
    name='exp'  # save results to project/name
    exist_ok=False  # existing project/name ok, do not increment
    line_thickness=3  # bounding box thickness (pixels)
    hide_labels=False  # hide labels
    hide_conf=False  # hide confidences
    half=False  # use FP16 half-precision inference
    dnn=False  # use OpenCV DNN for ONNX inference

    source = str(source)
    save_img = not nosave and not source.endswith('.txt')  # save inference images
    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
    webcam = source.isnumeric() or source.endswith('.txt') or (is_url and not is_file)
    if is_url and is_file:
        source = check_file(source)  # download

    #Initialize ROS node
    rospy.init_node('simple_publisher_py', anonymous=True)

    # register a publisher on the topic /chatter that will publish String messages
    pub = rospy.Publisher('coordinates_1', Float64MultiArray, queue_size=3)

    rospy.Subscriber("block", Int16, receive)

    rate = rospy.Rate(10)

    # Load model
    print("device_num : " + str(device_num))
    device = select_device(device_num)
    print("device : " + str(device))
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data)
    stride, names, pt, jit, onnx, engine = model.stride, model.names, model.pt, model.jit, model.onnx, model.engine
    imgsz = check_img_size(imgsz, s=stride)  # check image size

    # Half
    half &= (pt or jit or engine) and device.type != 'cpu'  # half precision only supported by PyTorch on CUDA
    if pt or jit:
        model.model.half() if half else model.model.float()

    # Dataloader
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt)
        bs = len(dataset)  # batch_size
    else:
        dataset = LoadImages(source, img_size=imgsz, stride=stride, auto=pt)
        bs = 1  # batch_size
    vid_path, vid_writer = [None] * bs, [None] * bs

    # Run inference
    model.warmup(imgsz=(1, 3, *imgsz), half=half)  # warmup
    dt, seen = [0.0, 0.0, 0.0], 0

    config = rs.config() #ok
    config.enable_stream(rs.stream.color, 424, 240, rs.format.bgr8, 30) #ok
    config.enable_stream(rs.stream.depth, 424, 240, rs.format.z16, 30) #ok

    pipeline = rs.pipeline() #ok
    profile = pipeline.start(config) #ok

    align_to = rs.stream.color #ok
    align = rs.align(align_to) #ok

    intr = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics() #ok

    while(not rospy.is_shutdown()):

        t0 = time.time()

        frames = pipeline.wait_for_frames()

        aligned_frames = align.process(frames)
        color_frame = aligned_frames.get_color_frame()
        depth_frame = aligned_frames.get_depth_frame()
        #ok
        if not depth_frame or not color_frame:
            continue
        #ok
        img = np.asanyarray(color_frame.get_data())
        im = img[8:232, 4:420]  # crop the 424x240 frame to 416x224 to match imgsz
        depth_image = np.asanyarray(depth_frame.get_data())
        #

        # Letterbox
        im0 = im.copy()
        im = im[np.newaxis, :, :, :]        

        # Stack
        im = np.stack(im, 0)

        # Convert
        im = im[..., ::-1].transpose((0, 3, 1, 2))  # BGR to RGB, BHWC to BCHW
        im = np.ascontiguousarray(im)

        t1 = time_sync()
        im = torch.from_numpy(im).to(device)
        im = im.half() if half else im.float()  # uint8 to fp16/32
        im /= 255  # 0 - 255 to 0.0 - 1.0
        if len(im.shape) == 3:
            im = im[None]  # expand for batch dim
        t2 = time_sync()
        dt[0] += t2 - t1

        # Inference
        #visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
        pred = model(im, augment=augment, visualize=False)
        t3 = time_sync()
        dt[1] += t3 - t2

        # NMS
        pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det)
        dt[2] += time_sync() - t3

        num_threads = threading.active_count()  # assumed: live-thread count (num_threads was undefined in the original)
        if num_threads > 1:
            print("Multi-threaded2")
        else:
            print("Single-threaded2")
        # Process predictions
        for i, det in enumerate(pred):  # per image
            seen += 1
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            imc = im0.copy() if save_crop else im0  # for save_crop
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(im.shape[2:], det[:, :4], im0.shape).round()
                delete_objects = True
                # Write results
                for *xyxy, conf, cls in reversed(det):
                    c = int(cls)  # integer class
                    label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')
                    if(label != 'dining table'):
                        annotator.box_label(xyxy, label, color=colors(c, True))
                        x = int((xyxy[0] + xyxy[2])/2)
                        y = int((xyxy[1] + xyxy[3])/2)
                        dist = depth_frame.get_distance(x + 4, y + 8)*1000
                        Xtarget = (dist*(x+4 - intr.ppx)/intr.fx - 35)/1000 #the distance from RGB camera to realsense center
                        Ytarget = (dist*(y+8 - intr.ppy)/intr.fy)/1000
                        Ztarget = dist/1000
                        object_coordinates.append([label, Xtarget, Ztarget])

                        p = np.array([[Xtarget], [Ytarget], [Ztarget], [1]])

                        # 'extrinsic' (the 4x4 camera-to-robot transform) is
                        # assumed to be defined elsewhere in the full script
                        w = np.dot(extrinsic, p)

                        if(signal_1 == 0):
                            rospy.loginfo(w)

                            if(w[0] > 0):
                                w[0] = w[0] + 0.0244083
                            elif (w[0] < 0):
                                w[0] = w[0] - 0.0244083
                            if(w[1] > 0):
                                w[1] = w[1] + 0.0244083
                            elif (w[1] < 0):
                                w[1] = w[1] - 0.0244083
                            #if(w[2] > 0):
                            #    w[2] = w[2] + 0.0244083
                            #elif (w[2] < 0):
                            #    w[2] = w[2] - 0.0244083

                            msg = Float64MultiArray()  # assumed: built fresh per publish (msg was undefined in the original)
                            msg.data.append(w[0])
                            msg.data.append(w[1])
                            msg.data.append(w[2] + 0.04)

                            pub.publish(msg)

                        coordinate_text = "(" + str(round(Xtarget)) + "," + str(round(Ytarget)) + "," + str(round(Ztarget)) + ")"
                        cv2.putText(im0, text=coordinate_text, org=(int((xyxy[0] + xyxy[2])/2), int((xyxy[1] + xyxy[3])/2)),
                        fontFace = cv_font, fontScale = 0.7, color=(255,255,255), thickness=1, lineType = cv2.LINE_AA)

                    ## Objects' Orientation Estimation ##
                        annotator.box_label(xyxy, label, color=colors(c, True))
                        offset_x = int((xyxy[2] - xyxy[0])/10)
                        offset_y = int((xyxy[3] - xyxy[1])/10)
                        interval_x = int((xyxy[2] - xyxy[0] -2 * offset_x)/2)
                        interval_y = int((xyxy[3] - xyxy[1] -2 * offset_y)/2)
                        points = np.zeros([9,3]) #OG@[9,3]
                        for i in range(3): #OG@3
                            for j in range(3): #OG@3
                                x = int(xyxy[0]) + offset_x + interval_x*i
                                y = int(xyxy[1]) + offset_y + interval_y*j
                                dist = depth_frame.get_distance(x, y)*1000 #OG@*1000
                                Xtemp = dist*(x - intr.ppx)/intr.fx
                                Ytemp = dist*(y - intr.ppy)/intr.fy
                                Ztemp = dist
                                points[j+i*3][0] = Xtemp
                                points[j+i*3][1] = Ytemp
                                points[j+i*3][2] = Ztemp

                        param = find_plane(points)

                        alpha = math.atan(param[2]/param[0])*180/math.pi
                        if(alpha < 0):
                            alpha = alpha + 90
                        else:
                            alpha = alpha - 90

                        gamma = math.atan(param[2]/param[1])*180/math.pi
                        if(gamma < 0):
                            gamma = gamma + 90
                        else:
                            gamma = gamma - 90

                        ## Display XYZ parameters ##
                        # (indented into the if-block so Xtarget/alpha/gamma are always defined)
                        coordinate_text = "(" + str(round(Xtarget)) + "," + str(round(Ytarget)) + "," + str(round(Ztarget)) + ")"
                        cv2.putText(im0, text=coordinate_text, org=(int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2) + 60),
                                    fontFace=cv_font, fontScale=0.5, color=(255,255,255), thickness=2, lineType=cv2.LINE_AA)
                        print(coordinate_text)

                        ## Display Alpha, Gamma parameters ##
                        text1 = "alpha : " + str(round(alpha))
                        text2 = "gamma : " + str(round(gamma))
                        cv2.putText(im0, text1, (int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2)), cv2.FONT_HERSHEY_PLAIN, 1.5, (255,255,255), thickness=2, lineType=cv2.LINE_AA)
                        cv2.putText(im0, text2, (int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2) + 30), cv2.FONT_HERSHEY_PLAIN, 1.5, (255,255,255), thickness=2, lineType=cv2.LINE_AA)
                        print("pitch : " + str(alpha) + ", roll : " + str(gamma))

        cv2.imshow("IMAGE", im0)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break 
        rate.sleep()

if __name__ == '__main__':
    try:
        run()
    except rospy.ROSInterruptException:
        pass

The program works without any bugs or errors: I can open the RealSense depth camera, take the x, y, z values, and transfer them to my other ROS node.

Then, for the more complex detection task, I need to use the yolov5s-seg.pt model. At the beginning I ran the predict.py code from the GitHub source: https://github.com/ultralytics/yolov5/blob/master/segment/predict.py, and the program worked. I kept modifying this code by adding pyrealsense2 and pipe.start(cfg), and ran it with python3 predict.py --source 4 (which is the RGB channel of the RealSense):

import argparse
import os
import platform
import sys
from pathlib import Path
from PIL import ImageDraw, ImageFont
import torch
import numpy as np
import time
import pyrealsense2 as rs
import math
import threading

FILE = Path(__file__).resolve()
ROOT = FILE.parents[1]  # YOLOv5 root directory
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))  # add ROOT to PATH
ROOT = Path(os.path.relpath(ROOT, Path.cwd()))  # relative

from ultralytics.utils.plotting import Annotator, colors, save_one_box
from models.common import DetectMultiBackend
from utils.dataloaders import IMG_FORMATS, VID_FORMATS, LoadImages, LoadScreenshots, LoadStreams
from utils.general import (LOGGER, Profile, check_file, check_img_size, check_imshow, check_requirements, colorstr, cv2,
                           increment_path, non_max_suppression, print_args, scale_boxes, scale_segments,
                           strip_optimizer)
from utils.segment.general import masks2segments, process_mask, process_mask_native
from utils.torch_utils import select_device, smart_inference_mode

from math import atan2, cos, sin, sqrt, pi, asin, acos

cfg = rs.config() #ok
cfg.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 60) #ok
cfg.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 60) #ok

pipe = rs.pipeline()
profile = pipe.start(cfg)

#pipe.start(cfg)
align_to = rs.stream.color
align = rs.align(align_to)

intr = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics() #ok

def find_plane(points):
    c = np.mean(points, axis=0)
    r0 = points - c
    u, s, v = np.linalg.svd(r0)
    nv = v[-1, :]
    ds = np.dot(points, nv)
    param = np.r_[nv, -np.mean(ds)]
    return param

def drawAxis(img, p_, q_, colour, scale):
    p = list(p_)
    q = list(q_)

    angle = atan2(p[1] - q[1], p[0] - q[0])  # angle in radians
    hypotenuse = sqrt((p[1] - q[1]) * (p[1] - q[1]) + (p[0] - q[0]) * (p[0] - q[0]))
    # Here we lengthen the arrow by a factor of scale
    q[0] = p[0] - scale * hypotenuse * cos(angle)
    q[1] = p[1] - scale * hypotenuse * sin(angle)
    cv2.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), colour, 1, cv2.LINE_AA)
    # create the arrow hooks
    p[0] = q[0] + 9 * cos(angle + pi / 4)
    p[1] = q[1] + 9 * sin(angle + pi / 4)
    cv2.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), colour, 1, cv2.LINE_AA)
    p[0] = q[0] + 9 * cos(angle - pi / 4)
    p[1] = q[1] + 9 * sin(angle - pi / 4)
    cv2.line(img, (int(p[0]), int(p[1])), (int(q[0]), int(q[1])), colour, 1, cv2.LINE_AA)

def get_frame_stream():
    # cfg = rs.config() #ok
    # cfg.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 60) #ok
    # cfg.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 60) #ok

    # pipe = rs.pipeline()
    # profile = pipe.start(cfg)

    # #pipe.start(cfg)
    # align_to = rs.stream.color
    # align = rs.align(align_to)

    # intr = profile.get_stream(rs.stream.color).as_video_stream_profile().get_intrinsics()
    # Wait for a coherent pair of frames: depth and color
    frames = pipe.wait_for_frames()
    aligned_frames = align.process(frames)
    depth_frame = aligned_frames.get_depth_frame()
    color_frame = aligned_frames.get_color_frame()
    depth_frame.get_height()

    if not depth_frame or not color_frame:
        # If there is no frame, the camera is probably not connected
        print("Error: impossible to get the frame; make sure the Intel RealSense camera is correctly connected")
        return (None,) * 7  # match the success return so the caller can unpack it

    color_intrin = color_frame.profile.as_video_stream_profile().intrinsics

    # Apply a spatial filter to fill the holes in the depth image
    spatial = rs.spatial_filter()
    spatial.set_option(rs.option.holes_fill, 3)
    filtered_depth = spatial.process(depth_frame)

    hole_filling = rs.hole_filling_filter()
    filled_depth = hole_filling.process(filtered_depth)

    # Create colormap to show the depth of the Objects
    colorizer = rs.colorizer()
    depth_colormap = np.asanyarray(colorizer.colorize(filled_depth).get_data())

    # Convert images to numpy arrays
    # distance = depth_frame.get_distance(int(50),int(50))
    # print("distance", distance)
    depth_image = np.asanyarray(filled_depth.get_data())
    color_image = np.asanyarray(color_frame.get_data())

    # cv2.imshow("Colormap", depth_colormap)
    # cv2.imshow("depth img", depth_image)

    return True, frames, color_image, depth_image, color_frame, depth_frame, color_intrin

def run(
    weights=ROOT / 'yolov5s-seg.pt',  # model.pt path(s)
    source=ROOT / 'data/images',  # file/dir/URL/glob/screen/0(webcam)
    data=ROOT / 'data/coco128.yaml',  # dataset.yaml path
    imgsz=(640, 640),  # inference size (height, width) #OG@(640, 640)
    conf_thres=0.35,  # confidence threshold
    iou_thres=0.45,  # NMS IOU threshold
    max_det=2,  # maximum detections per image
    device='',  # cuda device, i.e. 0 or 0,1,2,3 or cpu
    view_img=False,  # show results
    save_txt=False,  # save results to *.txt
    save_conf=False,  # save confidences in --save-txt labels
    save_crop=False,  # save cropped prediction boxes
    nosave=False,  # do not save images/videos
    classes=None,  # filter by class: --class 0, or --class 0 2 3
    agnostic_nms=True,  # class-agnostic NMS
    augment=False,  # augmented inference
    visualize=False,  # visualize features
    update=False,  # update all models
    project=ROOT / 'runs/predict-seg',  # save results to project/name
    name='exp',  # save results to project/name
    exist_ok=False,  # existing project/name ok, do not increment
    line_thickness=3,  # bounding box thickness (pixels)
    hide_labels=False,  # hide labels
    hide_conf=False,  # hide confidences
    half=False,  # use FP16 half-precision inference
    dnn=False,  # use OpenCV DNN for ONNX inference
    vid_stride=1,  # video frame-rate stride
    retina_masks=False,
    WHITE=(255, 255, 255),  # color for displaying text

):
    source = str(source)
    is_file = Path(source).suffix[1:] in (IMG_FORMATS + VID_FORMATS)
    is_url = source.lower().startswith(('rtsp://', 'rtmp://', 'http://', 'https://'))
    webcam = source.isnumeric() or source.endswith('.streams') or (is_url and not is_file)
    if is_url and is_file:
        source = check_file(source)  # download

    # Directories
    save_dir = increment_path(Path(project) / name, exist_ok=exist_ok)  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Load model
    device = select_device(device)
    model = DetectMultiBackend(weights, device=device, dnn=dnn, data=data, fp16=half)
    stride, names, pt = model.stride, model.names, model.pt
    imgsz = check_img_size(imgsz, s=stride)  # check image size

    # Dataloader
    bs = 1  # batch_size
    view_img = check_imshow(warn=True)
    #cap = cv2.VideoCapture(1)
    #frame_img = cap.read()
    dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
    bs = len(dataset)
    vid_path, vid_writer = [None] * bs, [None] * bs
    #get_frame_stream()

    # Run inference
    model.warmup(imgsz=(1 if pt else bs, 3, *imgsz))  # warmup
    seen, windows, dt = 0, [], (Profile(), Profile(), Profile())
    for path, im, im0s, vid_cap, s in dataset:
        with dt[0]:
            frame_img = vid_cap
            im = torch.from_numpy(im).to(model.device)
            im = im.half() if model.fp16 else im.float()  # uint8 to fp16/32
            imnew = im.half() if model.fp16 else im.int()  # uint8 to fp16/32
            im /= 255  # 0 - 255 to 0.0 - 1.0
            if len(im.shape) == 3:
                im = im[None]  # expand for batch dim

        # Inference
        with dt[1]:
            visualize = increment_path(save_dir / Path(path).stem, mkdir=True) if visualize else False
            pred, proto = model(im, augment=augment, visualize=visualize)[:2]

        # NMS
        with dt[2]:
            pred = non_max_suppression(pred, conf_thres, iou_thres, classes, agnostic_nms, max_det=max_det, nm=32)

        # Second-stage classifier (optional)
        # pred = utils.general.apply_classifier(pred, classifier_model, im, im0s)

        # Process predictions
        for i, det in enumerate(pred):  # per image
            seen += 1
            p, im0, frame = path[i], im0s[i].copy(), dataset.count
            s += f'{i}: '

            p = Path(p)  # to Path
            save_path = str(save_dir / p.name)  # im.jpg
            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # im.txt
            s += '%gx%g ' % im.shape[2:]  # print string
            annotator = Annotator(im0, line_width=line_thickness, example=str(names))
            if len(det):
                masks = process_mask(proto[i], det[:, 6:], det[:, :4], im.shape[2:], upsample=True)  # HWC
                #print(masks)
                det[:, :4] = scale_boxes(im.shape[2:], det[:, :4], im0.shape).round()  # rescale boxes to im0 size

                masks2 = process_mask(proto[i], det[:, 6:], det[:, :4], imnew.shape[2:], upsample=True)  # HWC
                #print(masks)
                det[:, :4] = scale_boxes(imnew.shape[2:], det[:, :4], im0.shape).round()  # rescale boxes to im0 size

                segments2 = [
                    scale_segments(im0.shape if retina_masks else imnew.shape[2:], x, im0.shape, normalize=False)
                    for x in reversed(masks2segments(masks))]
                #NumberOfElements = sizeof(arr) / sizeof(arr[0]);
                #segments2_conver = np.array(segments2, dtype=np.int32)
                #print(segments2_conver)

                segments = [
                    scale_segments(im0.shape if retina_masks else im.shape[2:], x, im0.shape, normalize=True)
                    for x in reversed(masks2segments(masks))]
                #print(segments)

                # Print results
                for c in det[:, 5].unique():
                    n = (det[:, 5] == c).sum()  # detections per class
                    s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string
                    #print(n)
                # Mask plotting
                annotator.masks(
                    masks,
                    colors=[colors(x, True) for x in det[:, 5]],
                    im_gpu=torch.as_tensor(im0, dtype=torch.float16).to(device).permute(2, 0, 1).flip(0).contiguous() /
                    255 if retina_masks else im[i])

                ret, new_frame, bgr_image, depth_image, bgr_frame, depth_frame, color_intrin = get_frame_stream()
                # Write results
                for j, (*xyxy, conf, cls) in enumerate(reversed(det[:, :6])):
                    seg = segments[j].reshape(-1)  # (n,2) to (n*2)
                    #print(segments[j])
                    line = (cls, *seg, conf) if save_conf else (cls, *seg)  # label format
                    #with open(f'{txt_path}.txt', 'a') as f:
                        #f.write(('%g ' * len(line)).rstrip() % line + '\n')
                    c = int(cls)  # integer class
                    label = None if hide_labels else (names[c] if hide_conf else f'{names[c]} {conf:.2f}')

                    annotator.box_label(xyxy, label, color=colors(c, True))
                    box = xyxy
                    centerx, centery = int((int(box[0]) + int(box[2])) / 2), int((int(box[1]) + int(box[3])) / 2)
                    center_point = centerx, centery
                    #distance_mm = depth_image[centery, centerx]
                    #print(center_point)
                    #ImageDraw.ImageDraw.polygon(, segments2[j], outline=colors(c, True), width=3)
                    #cv2.circle(vid_cap,(segments2[j]), radius=1, color=(0,0,255), thickness=1)
                    im0 = annotator.result()
                    seg_point = np.array(segments2[j], dtype=np.int32)
                    #cv2.putText(im0, "{} mm".format(distance_mm), (centerx, centery - 10), 0, 0.5, (0, 0, 255), 2)
                    #center_point_XYZ = rs.rs2_deproject_pixel_to_point(color_intrin, center_point, distance_mm)
                    #print(np.array(center_point_XYZ).astype(int))
                    print(center_point)
                    cv2.polylines(im0, [np.array(segments2[j], dtype=np.int32)], isClosed=True, color=(0,0,0), thickness=3)

                    "So this is the list of segmentation point segments2[j] "
                    #print([np.array(segments2[j], dtype=np.int32)])

                    "So we will move on to do the PCA for detecting vector X,Y for and object"
                    ""
                    ###########################################################
                    ##################### YAW ESTIMATION. #####################
                    ###########################################################

                    # Perform PCA analysis
                    mean = np.empty((0))
                    mean, eigenvectors, eigenvalues = cv2.PCACompute2(np.array(segments2[j], dtype=np.float64), mean)
                    # Store the center of the object
                    cntr = (int(mean[0, 0]), int(mean[0, 1]))

                    # Draw the principal components
                    cv2.circle(im0, cntr, 3, (255, 0, 255), 2)
                    p1 = (cntr[0] + 0.02 * eigenvectors[0, 0] * eigenvalues[0, 0],
                          cntr[1] + 0.02 * eigenvectors[0, 1] * eigenvalues[0, 0])
                    p2 = (cntr[0] - 0.02 * eigenvectors[1, 0] * eigenvalues[1, 0],
                          cntr[1] - 0.02 * eigenvectors[1, 1] * eigenvalues[1, 0])
                    drawAxis(im0, cntr, p1, (0, 255, 0), 1)
                    drawAxis(im0, cntr, p2, (255, 255, 0), 5)
                    yaw = atan2(eigenvectors[0, 1], eigenvectors[0, 0])  # orientation in radians
                    #print("Yaw: ", round(yaw, 2))

                    ##########################################################
                    ##################### SIZE ESTIMATION. ###################
                    ##########################################################

                    "So in This part we will try to extract the maximum, minimum of coordinates x,y"
                    "Remember that the highest point Y have the lowest value"
                    "Remember that the furthest point X have the highest value"

                    min_xy = np.min(seg_point, axis=0)
                    min_x , min_y = int(min_xy[0]), int(min_xy[1])

                    #index_ymin = np.where(seg_point == min_y)[0]
                    #list_ymin = index_ymin.tolist()
                    #cv2.circle(im0, index_ymin, 3, (255, 0, 0))
                    #index_xmin = np.where(seg_point == min_x)[0]
                    #cv2.circle(im0, index_xmin, 3, (255, 0, 0))

                    max_xy = np.max(seg_point, axis=0)
                    max_x, max_y = int(max_xy[0]), int(max_xy[1])

                    index_ymax = np.where(seg_point == max_y)[0]
                    index_xmax = np.where(seg_point == max_x)[0]
                    #cv2.circle(im0, index_ymax, 3, (255, 0, 0))
                    #cv2.circle(im0, index_xmax, 3, (255, 0, 0))
                    #print(max)

                    center_point_updated = int((max_x+min_x)/2), int((max_y+min_y)/2)
                    cv2.circle(im0, center_point_updated, 3, (255, 0, 0))

                    distance_1 = depth_image[centery, min_x+5]
                    distance_2 = depth_image[centery, max_x-5]
                    distance_3 = depth_image[min_y+5, centerx]
                    distance_4 = depth_image[max_y-5, centerx]
                    distance_mm = depth_image[centery, centerx]

                    point1 = rs.rs2_deproject_pixel_to_point(color_intrin, [min_x+5, centery], distance_1)
                    point2 = rs.rs2_deproject_pixel_to_point(color_intrin, [max_x-5, centery], distance_2)
                    point3 = rs.rs2_deproject_pixel_to_point(color_intrin, [centerx, min_y+5], distance_3)
                    point4 = rs.rs2_deproject_pixel_to_point(color_intrin, [centerx, max_y-5], distance_4)
                    cnt_point = rs.rs2_deproject_pixel_to_point(color_intrin, [centerx, centery], distance_mm)

                    wide_0 = int(90)
                    high_0 = int(140)

                    wide = int(math.sqrt(math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2) +
                                         math.pow(point1[2] - point2[2], 2)))
                    high = int(math.sqrt(math.pow(point3[0] - point4[0], 2) + math.pow(point3[1] - point4[1], 2) +
                                         math.pow(point3[2] - point4[2], 2)))
                    # if wide < high:
                    #     cv2.putText(im0, "Width:{} mm".format(wide), (centerx, centery + 10), 0, 0.5, (0, 0, 255), 2)
                    #     cv2.putText(im0, "Height{} mm".format(high), (centerx, centery + 30), 0, 0.5, (0, 0, 255), 2)
                    # else:
                    #     cv2.putText(im0, "Width:{} mm".format(wide), (centerx, centery + 10), 0, 0.5, (0, 0, 255), 2)
                    #     cv2.putText(im0, "Height{} mm".format(high), (centerx, centery + 30), 0, 0.5, (0, 0, 255), 2)

                    if j == 5:
                        if wide < high:
                            wide_0 = wide
                            high_0 = high
                    cv2.circle(im0, (min_x, centery), 3, (255, 0, 0))
                    cv2.circle(im0, (max_x, centery), 3, (255, 0, 0))
                    cv2.circle(im0, (centerx, min_y), 3, (255, 0, 0))
                    cv2.circle(im0, (centerx, max_y), 3, (255, 0, 0))

                    ##########################################################
                    ################ ROLL, PITCH ESTIMATION. #################
                    ##########################################################

                    offset_x = int((xyxy[2] - xyxy[0])/10)
                    offset_y = int((xyxy[3] - xyxy[1])/10)
                    interval_x = int((xyxy[2] - xyxy[0] -2 * offset_x)/2)
                    interval_y = int((xyxy[3] - xyxy[1] -2 * offset_y)/2)
                    points = np.zeros([9,3]) #OG@[9,3]
                    for i in range(3): #OG@3
                        for j in range(3): #OG@3
                            x = int(xyxy[0]) + offset_x + interval_x*i
                            y = int(xyxy[1]) + offset_y + interval_y*j
                            dist = depth_frame.get_distance(x, y)*1000 #OG@*1000
                            Xtemp = dist*(x - intr.ppx)/intr.fx
                            Ytemp = dist*(y - intr.ppy)/intr.fy
                            Ztemp = dist
                            points[j+i*3][0] = Xtemp
                            points[j+i*3][1] = Ytemp
                            points[j+i*3][2] = Ztemp

                    param = find_plane(points)

                    alpha = math.atan(param[2]/param[0])*180/math.pi
                    if(alpha < 0):
                        alpha = alpha + 90
                    else:
                        alpha = alpha - 90

                    gamma = math.atan(param[2]/param[1])*180/math.pi
                    if(gamma < 0):
                        gamma = gamma + 90
                    else:
                        gamma = gamma - 90

                    object_coordinates = []
                    x = int((xyxy[0] + xyxy[2])/2)
                    y = int((xyxy[1] + xyxy[3])/2)
                    dist = depth_frame.get_distance(x + 4, y + 8)*1000
                    Xtarget = dist*(x+4 - intr.ppx)/intr.fx - 35 #the distance from RGB camera to realsense center
                    Ytarget = dist*(y+8 - intr.ppy)/intr.fy
                    Ztarget = dist
                    object_coordinates.append([label, Xtarget, Ztarget])

                    #####################################################
                    ############ DISPLAYING PARAMETER SECTION ############
                    #####################################################

                    ## Display XYZ parameters ##
                    coordinate_text = "(" + str(round(Xtarget)) + "," + str(round(Ytarget)) + "," + str(round(Ztarget)) + ")"
                    #cv2.putText(im0, text=coordinate_text, org=(int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2) + 60),
                    #fontFace = cv2.FONT_HERSHEY_PLAIN, fontScale = 0.5, color=WHITE, thickness=2, lineType = cv2.LINE_AA)
                    cv2.putText(im0, coordinate_text, (int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2) + 60), cv2.FONT_HERSHEY_PLAIN, 1, WHITE, thickness=2, lineType=cv2.LINE_AA)
                    print("(" + str(round(Xtarget)) + "," + str(round(Ytarget)) + "," + str(round(Ztarget)) + ")")

                    ## Display Roll and Pitch Angles ##
                    label_roll = "Roll : " + str(round(gamma, 2))
                    label_pitch = "Pitch : " + str(round(alpha, 2))
                    cv2.putText(im0, label_roll, (int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2)), cv2.FONT_HERSHEY_PLAIN, 1, WHITE, thickness=2, lineType=cv2.LINE_AA)
                    cv2.putText(im0, label_pitch, (int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2) + 20), cv2.FONT_HERSHEY_PLAIN, 1, WHITE, thickness=2, lineType=cv2.LINE_AA)
                    print("Roll : " + str(round(gamma, 2)) + ", Pitch : " + str(round(alpha, 2)))

                    ## Display Yaw Angle ##
                    cvtR2D = np.rad2deg(yaw)
                    #label_yaw = "Yaw: " + str(round(-int(np.rad2deg(angle)) - 90), 2)
                    label_yaw = "Yaw: " + str(round((-int(cvtR2D) - 90), 2))
                    #label_yaw = "Yaw: " + str(round(np.rad2deg(angle)) - 90, 2)
                    #textbox = cv.rectangle(img, (cntr[0], cntr[1]-25), (cntr[0] + 250, cntr[1] + 10), (255,255,255), -1)
                    #cv2.putText(im0, label_yaw, (cntr[0], cntr[1]), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,0), 1, cv2.LINE_AA)
                    cv2.putText(im0, label_yaw, (int((xyxy[0] + xyxy[2])/2) - 40, int((xyxy[1] + xyxy[3])/2) + 40), cv2.FONT_HERSHEY_PLAIN, 1, WHITE, thickness=2, lineType=cv2.LINE_AA)
                    print("Yaw: ", round(yaw, 2))
                    # roll = round(asin(wide/wide_0)*180/pi, 1)

                    # pitch = round(asin(high/high_0)*180/pi, 1)

                    # cv2.putText(im0, "{} degree".format(roll), (centerx, centery + 50), 0, 0.5, (0, 0, 255), 2)
                    # cv2.putText(im0, "{} degree".format(pitch), (centerx, centery + 70), 0, 0.5, (0, 0, 255), 2)
                    #print(center_point_updated)
                    #cv2.circle(im0, max_xy, 3, (255, 0, 0))
                    #annotator.draw.polygon(segments[j], outline=colors(c, True), width=3)

            # Stream results
            if view_img:
                if platform.system() == 'Linux' and p not in windows:
                    windows.append(p)
                    cv2.namedWindow(str(p), cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)  # allow window resize (Linux)
                    cv2.resizeWindow(str(p), im0.shape[1], im0.shape[0])
                cv2.imshow(str(p), im0)
                if cv2.waitKey(1) == ord('q'):  # 1 millisecond
                    exit()

        # Print time (inference-only)
        LOGGER.info(f"{s}{'' if len(det) else '(no detections), '}{dt[1].dt * 1E3:.1f}ms")

    # Print results
    t = tuple(x.t / seen * 1E3 for x in dt)  # speeds per image
    LOGGER.info(f'Speed: %.1fms pre-process, %.1fms inference, %.1fms NMS per image at shape {(1, 3, *imgsz)}' % t)

def parse_opt():
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default=ROOT / 'yolov5s-seg.pt', help='model path(s)')
    parser.add_argument('--source', type=str, default=ROOT / 'data/images', help='file/dir/URL/glob/screen/0(webcam)')
    parser.add_argument('--data', type=str, default=ROOT / 'data/coco128.yaml', help='(optional) dataset.yaml path')
    parser.add_argument('--imgsz', '--img', '--img-size', nargs='+', type=int, default=[640], help='inference size h,w')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='NMS IoU threshold')
    parser.add_argument('--max-det', type=int, default=1000, help='maximum detections per image')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='show results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-crop', action='store_true', help='save cropped prediction boxes')
    parser.add_argument('--nosave', action='store_true', help='do not save images/videos')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --classes 0, or --classes 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--visualize', action='store_true', help='visualize features')
    parser.add_argument('--update', action='store_true', help='update all models')
    parser.add_argument('--project', default=ROOT / 'runs/predict-seg', help='save results to project/name')
    parser.add_argument('--name', default='exp', help='save results to project/name')
    parser.add_argument('--exist-ok', action='store_true', help='existing project/name ok, do not increment')
    parser.add_argument('--line-thickness', default=3, type=int, help='bounding box thickness (pixels)')
    parser.add_argument('--hide-labels', default=False, action='store_true', help='hide labels')
    parser.add_argument('--hide-conf', default=False, action='store_true', help='hide confidences')
    parser.add_argument('--half', action='store_true', help='use FP16 half-precision inference')
    parser.add_argument('--dnn', action='store_true', help='use OpenCV DNN for ONNX inference')
    parser.add_argument('--vid-stride', type=int, default=1, help='video frame-rate stride')
    parser.add_argument('--retina-masks', action='store_true', help='whether to plot masks in native resolution')
    opt = parser.parse_args()
    opt.imgsz *= 2 if len(opt.imgsz) == 1 else 1  # expand
    print_args(vars(opt))
    return opt

def main(opt):
    check_requirements(ROOT / 'requirements.txt', exclude=('tensorboard', 'thop'))
    run(**vars(opt))

if __name__ == '__main__':
    opt = parse_opt()
    main(opt)

It throws this error:

Fusing layers... 
YOLOv5s-seg summary: 224 layers, 7611485 parameters, 0 gradients, 26.4 GFLOPs
[ WARN:0@3.658] global cap_v4l.cpp:982 open VIDEOIO(V4L2:/dev/video4): can't open camera by index
[ERROR:0@3.659] global obsensor_uvc_stream_channel.cpp:156 getStreamChannelGroup Camera index out of range
Traceback (most recent call last):
  File "test_seg_2.py", line 515, in <module>
    main(opt)
  File "test_seg_2.py", line 511, in main
    run(**vars(opt))
  File "test_seg_2.py", line 181, in run
    dataset = LoadStreams(source, img_size=imgsz, stride=stride, auto=pt, vid_stride=vid_stride)
  File "/home/anthonia/Thesis/src/robot_project/yolov5-master/utils/dataloaders.py", line 367, in __init__
    assert cap.isOpened(), f'{st}Failed to open {s}'
AssertionError: 1/1: 4... Failed to open 4

Starting from the code above, I then moved the camera-opening code into the get_frame_stream() function. The camera opens, but as soon as it detects something the program crashes with:

profile = pipe.start(cfg)
RuntimeError: xioctl(VIDIOC_S_FMT) failed, errno=16 Last Error: Device or resource busy
terminate called without an active exception
Aborted (core dumped)
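
For reference, a direction I am considering is to bypass LoadStreams entirely and feed the aligned RealSense color frames to the segmentation model myself, so OpenCV's VideoCapture never touches the device that pyrealsense2 already holds. A minimal sketch, not the final script:

import numpy as np
import pyrealsense2 as rs
import torch

from models.common import DetectMultiBackend
from utils.augmentations import letterbox
from utils.general import non_max_suppression

pipe = rs.pipeline()
cfg = rs.config()
cfg.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
pipe.start(cfg)

model = DetectMultiBackend('yolov5s-seg.pt')  # weights path assumed
try:
    while True:
        color = pipe.wait_for_frames().get_color_frame()
        if not color:
            continue
        im0 = np.asanyarray(color.get_data())
        im = letterbox(im0, 640, stride=32, auto=True)[0]    # resize + pad
        im = im.transpose((2, 0, 1))[::-1]                   # HWC BGR -> CHW RGB
        im = torch.from_numpy(np.ascontiguousarray(im)).float() / 255
        pred, proto = model(im[None])[:2]                    # seg head also returns mask protos
        pred = non_max_suppression(pred, 0.25, 0.45, nm=32)  # nm=32 mask coefficients
finally:
    pipe.stop()  # always release the camera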

Environment

YOLOv5 🚀 2023-10-3, Python 3.8.10, torch 2.1.0+cu118, CUDA:0 (NVIDIA GeForce RTX 3050 Laptop GPU, 3910MiB)
OS: Ubuntu 20.04
Python version: 3.8.10 (distributed with ROS Noetic)

MartyG-RealSense commented 1 year ago

Hi @Anthonia2001. Are the modified yolov5s.pt code and the modified predict.py two separate scripts? Both contain a pipe.start instruction.

So if the modified yolov5s.pt script was run first and the modified predict.py script was run afterwards, the camera would be 'busy' for the predict.py script because the yolov5s.pt script had already started the camera's pipeline.
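
You can reproduce that 'busy' state in isolation with a couple of lines, as a sketch (assuming one connected camera):

import pyrealsense2 as rs

p1 = rs.pipeline()
p1.start()        # the first pipeline claims the camera

p2 = rs.pipeline()
try:
    p2.start()    # a second start on the same device fails
except RuntimeError as e:
    print(e)      # librealsense reports the device as busy
finally:
    p1.stop()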

Anthonia2001 commented 1 year ago

Hi! Thanks for your reply. Yes, they are two separate scripts: the yolov5s.pt code and the yolov5s-seg.pt code live in two different projects. After I stop executing the yolov5s script and run the modified predict.py from the other project, the camera is busy.

MartyG-RealSense commented 1 year ago

If you reboot your computer and then run predict.py first without running the yolov5s.pt script beforehand and it works, that could indicate that the method by which you stop executing the yolov5s.pt script is not properly closing down the pipeline, preventing it from working properly when predict.py is run.
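
One pattern that guarantees the pipeline is released however the script exits is to wrap the capture loop like this (a minimal sketch):

import pyrealsense2 as rs

pipe = rs.pipeline()
pipe.start()
try:
    while True:
        frames = pipe.wait_for_frames()
        # ... process the frames ...
except KeyboardInterrupt:
    pass
finally:
    pipe.stop()  # releases the device even after Ctrl+C or an exception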

Anthonia2001 commented 1 year ago

Yeah, actually I have tried what you suggested before. I shut down my PC and then started with predict.py first, but it still doesn't work. I think maybe there is some conflict between RealSense and the yolov5s-seg.pt model on Ubuntu, right?

MartyG-RealSense commented 1 year ago

You are the first RealSense user to report having used yolov5s-seg, so there is no way to be certain. It would be reasonable to conclude, though, that if yolov5 works and yolov5s-seg does not, then the issue is with yolov5s-seg specifically.

Anthonia2001 commented 1 year ago

I really appreciate your help. I have posted the issue on the yolov5 GitHub. Hopefully I will find a solution there.

MartyG-RealSense commented 1 year ago

You are very welcome. I'm pleased that I could be of help. Good luck!

Anthonia2001 commented 1 year ago

Thanks a lot. Have a nice weekend!

MartyG-RealSense commented 1 year ago

Case closed due to no further comments received.