WongKinYiu / yolov7

Implementation of paper - YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors
GNU General Public License v3.0

GPU memory leak while running detect() in a QThread #883

Open Nervol opened 2 years ago

Nervol commented 2 years ago

Hi there. I am trying to understand a GPU memory leak in my multithreaded Qt application. The worker class that I move to a QThread has the following structure:

class videoThread(QtCore.QObject):
    signalPixmap = QtCore.pyqtSignal(np.ndarray)
    countChanged = QtCore.pyqtSignal(int)

    def __init__(self, deviceList, pickedIndex, stopFlag) -> None:
        super().__init__()
        self.deviceList = deviceList
        self.pickedIndex = pickedIndex
        self.cam = MvCamera()  # Hikvision camera SDK handle
        self.rotAngle = 0
        self.thresh = 1
        self.peaksArray = []
        self.stopFlag = stopFlag

        self.weights ='yolov7-tiny.pt'  # model.pt path(s)
        self.source = '0'  # file/dir/URL/glob, 0 for webcam
        self.img_size = 640
        self.conf_thres = 0.25  # confidence threshold
        self.iou_thres = 0.45 # NMS IOU threshold
        self.device = ''  # cuda device, i.e. 0 or 0,1,2,3 or cpu
        self.view_img = True  # show results
        self.save_txt = False  # save results to *.txt
        self.save_conf = False  # save confidences in --save-txt labels
        self.save_crop = False  # save cropped prediction boxes
        self.nosave = True  # do not save images/videos
        self.classes = None  # filter by class: --class 0, or --class 0 2 3
        self.agnostic_nms = None  # class-agnostic NMS
        self.augment = False
        self.update = False  # update all models
        self.project = 'runs/detect'  # save results to project/name
        self.name = 'exp'  # save results to project/name
        self.exist_ok = False  # existing project/name ok, do not increment
        self.no_trace = False

        self.device = select_device(self.device)
        self.half = self.device.type != 'cpu'  # half precision only supported on CUDA
        self.model = attempt_load(self.weights, map_location=self.device)  # load FP32 model
        self.stride = int(self.model.stride.max())
        self.imgsz = check_img_size(self.img_size, s=self.stride)  # check img_size
        if self.half:
            self.model.half()  # convert model to FP16

        # Get names and colors
        self.names = self.model.module.names if hasattr(self.model, 'module') else self.model.names
        self.colors = [[random.randint(0, 255) for _ in range(3)] for _ in self.names]

        # Run inference
        if self.device.type != 'cpu':
            self.model(torch.zeros(1, 3, self.imgsz, self.imgsz).to(self.device).type_as(next(self.model.parameters())))  # run once

    def runStream(self):
        # Grabs frames from the camera in a loop and passes each one into
        # detect(); the body was omitted in the original post.
        pass

    def detect(self, img_buff):
        vid_path, vid_writer = None, None
        cudnn.benchmark = True  # set True to speed up constant image size inference

        old_img_w = old_img_h = self.imgsz
        old_img_b = 1

        t0 = time.time()

        img0 = [img_buff]  # single-frame "batch"

        img = [letterbox(x, self.img_size, auto=True, stride=self.stride)[0] for x in img0]
        # Stack
        img = np.stack(img, 0)

        # Convert
        img = img[:, :, :, ::-1].transpose(0, 3, 1, 2)  # BGR to RGB, BHWC to BCHW
        img = np.ascontiguousarray(img)

        img = torch.from_numpy(img).to(self.device)
        img = img.half() if self.half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Warmup (note: old_img_* are reset at the top of every call, so this
        # re-runs whenever the letterboxed shape differs from imgsz)
        if self.device.type != 'cpu' and (
                old_img_b != img.shape[0] or old_img_h != img.shape[2] or old_img_w != img.shape[3]):
            old_img_b = img.shape[0]
            old_img_h = img.shape[2]
            old_img_w = img.shape[3]
            for i in range(3):
                self.model(img, augment=False)[0]

        # Inference
        t1 = time_synchronized()
        print(f'MEMORY: {torch.cuda.mem_get_info(device=0)}')
        pred = self.model(img, augment=self.augment)[0]
        print(f'MEMORY: {torch.cuda.mem_get_info(device=0)}')
        t2 = time_synchronized()

        # Apply NMS
        pred = non_max_suppression(pred, self.conf_thres, self.iou_thres, classes=self.classes,
                                   agnostic=self.agnostic_nms)
        t3 = time_synchronized()

        # Process detections
        for i, det in enumerate(pred):  # detections per image
            s, im0 = '%g: ' % i, img0[i].copy()
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()

                # Print results
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += f"{n} {self.names[int(c)]}{'s' * (n > 1)}, "  # add to string

                # Write results
                for *xyxy, conf, cls in reversed(det):

                    if self.view_img:  # Add bbox to image
                        label = f'{self.names[int(cls)]} {conf:.2f}'
                        plot_one_box(xyxy, im0, label=label, color=self.colors[int(cls)], line_thickness=1)

            # Print time (inference + NMS)
            print(f'{s}Done. ({(1E3 * (t2 - t1)):.1f}ms) Inference, ({(1E3 * (t3 - t2)):.1f}ms) NMS')
            t4 = time.time()
            timedif = t4 - t0
            fps = 1 / (timedif)
            t0 = t4
            print("FPS: {:.2f}".format(fps))
            pred = None  # drop the reference, attempting to free GPU memory
            # Check Memory
            if self.view_img:
                print(f'MEMORY: {torch.cuda.mem_get_info(device=0)}')
        return im0

The detect() method is called from the runStream() method, which is started when the QThread starts. Around each execution of pred = self.model(img, self.augment)[0] I perform a memory check. Soon after recognition starts I get an out-of-memory error of the form "RuntimeError: CUDA out of memory. Tried to allocate...", because the amount of free memory decreases with EVERY inference that returns detected objects. If nothing is detected, the memory does not decrease. Example console output on an RTX 3070 Ti with 8 GB of VRAM (each tuple is free/total bytes from torch.cuda.mem_get_info):

MEMORY: (254803968, 8589410304)
MEMORY: (254803968, 8589410304)
0: 1 person, Done. (9.8ms) Inference, (1.0ms) NMS
FPS: 23.08
MEMORY: (254803968, 8589410304)
18.552628319680107
MEMORY: (191889408, 8589410304)
MEMORY: (191889408, 8589410304)
0: 1 person, Done. (9.2ms) Inference, (1.0ms) NMS
FPS: 22.11
MEMORY: (191889408, 8589410304)
17.87137293667499
MEMORY: (152043520, 8589410304)
MEMORY: (152043520, 8589410304)
0: 1 person, Done. (7.6ms) Inference, (1.0ms) NMS
FPS: 24.33
MEMORY: (152043520, 8589410304)
19.394728567465087
MEMORY: (89128960, 8589410304)
MEMORY: (89128960, 8589410304)
0: 1 person, Done. (9.8ms) Inference, (1.1ms) NMS
FPS: 21.36
MEMORY: (89128960, 8589410304)
17.986714639198247
MEMORY: (26214400, 8589410304)
MEMORY: (26214400, 8589410304)
0: 1 person, Done. (8.0ms) Inference, (1.0ms) NMS
FPS: 22.96
MEMORY: (26214400, 8589410304)
18.59012498891942
MEMORY: (0, 8589410304)
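
(Side note, my own illustration rather than part of the original logs: torch.cuda.mem_get_info reports free/total bytes for the whole device; the caching allocator's own counters can be easier to read per frame. A hypothetical helper, assuming PyTorch with CUDA:)

def log_gpu_mem(tag=''):
    # Hypothetical logging helper, not in the original code
    alloc = torch.cuda.memory_allocated() / 2**20     # MiB currently held by tensors
    reserved = torch.cuda.memory_reserved() / 2**20   # MiB reserved by the caching allocator
    print(f'{tag} allocated={alloc:.1f} MiB, reserved={reserved:.1f} MiB')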

I want to note that the same code based on YOLOv5 does not cause any memory leak, so the problem seems specific to YOLOv7. Also, there are no errors when using a standard Python Thread instead of QThread.

Nervol commented 2 years ago

Also, in case you are interested, here is how I start the thread:

self.videoThread = QtCore.QThread()
print(self.videoThread.isRunning())
self.video = videoThread(
    self.deviceList, self.comboBoxOfDevices.currentIndex(), self.stopFlag == False)
self.video.moveToThread(self.videoThread)
self.videoThread.started.connect(self.video.runStream)
self.video.signalPixmap.connect(self.showImage)

self.videoThread.start()

awarebayes commented 1 year ago

Same issue

petpetpeter commented 1 year ago

While writing my own inference code, I ran into the same GPU memory leak. The problem disappeared when I commented out the non-max-suppression call:

pred = non_max_suppression(pred, self.conf_threshold)

So I tried wrapping it in torch.no_grad(), and that fixed it in my case:

with torch.no_grad():
    pred = non_max_suppression(torch.tensor(pred), conf_thres=self.conf, iou_thres=0.6)
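
This makes sense: outputs produced outside torch.no_grad() carry an autograd graph, and any reference that keeps such a tensor alive also keeps the whole graph (and its saved activations) on the GPU. A standalone sketch of the mechanism, using a toy Conv2d model rather than YOLOv7 itself:

import torch

model = torch.nn.Conv2d(3, 16, 3).cuda().eval()
kept = []
for _ in range(3):
    out = model(torch.zeros(1, 3, 64, 64, device='cuda'))
    kept.append(out)  # each retained output keeps its graph alive -> memory grows
    print(torch.cuda.memory_allocated())

with torch.no_grad():  # no graph is built here
    out = model(torch.zeros(1, 3, 64, 64, device='cuda'))
print(out.requires_grad)  # False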
its-jd commented 1 year ago

Have a look at this: https://github.com/WongKinYiu/yolov7/pull/900

Nervol commented 1 year ago

Thanks, this seems to be a gradient retention problem.
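
For anyone who lands here later, a minimal sketch of applying the fix to the detect() method from my first post (wrapping both the forward pass and NMS together; a @torch.no_grad() decorator on detect() would work as well):

# Inside detect(), replace the inference + NMS section with:
with torch.no_grad():
    pred = self.model(img, augment=self.augment)[0]
    pred = non_max_suppression(pred, self.conf_thres, self.iou_thres,
                               classes=self.classes, agnostic=self.agnostic_nms)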
