wang-xinyu / tensorrtx

Implementation of popular deep learning networks with TensorRT network definition API
MIT License

RAM gets full in video inference #449

Closed simaiden closed 3 years ago

simaiden commented 3 years ago

Env

About this repo

Your problem

When using yolov5_trt.py for video inference, the RAM fills up even before the model has predicted the first frame. Here is my code:

import ctypes
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import torch
import torchvision

class YoloTRTPredictor(Predictor):
    def __init__(self,cfg):
        super().__init__(cfg)
        self.cfg = cfg               

        self.INPUT_W = 608
        self.INPUT_H = 608
        self.CONF_THRESH = 0.5
        self.IOU_THRESHOLD = 0.4

        self.load_model()

    def predict(self,x,**kwargs):
        #threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.
        self.cfx.push()
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        # Preprocessing is done by the caller; x is already the tuple
        # (input_image, image_raw, origin_h, origin_w) from prepare_input()
        input_image, image_raw, origin_h, origin_w = x
        # Copy input image to host buffer
        np.copyto(host_inputs[0], input_image.ravel())
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()
        # Use the first output buffer, since batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        result_boxes, result_scores, result_classid = self.post_process(
            output, origin_h, origin_w
        )
        count = result_boxes.shape[0]
        print('prediction')
        output = {'count' : count,
                'detections':result_boxes,
                'image':x}  
        return output

    def prepare_input(self, image_raw):
        """
        description: Read an image from image path, convert it to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """

        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width, height and paddings
        r_w = self.INPUT_W / w
        r_h = self.INPUT_H / h
        if r_h > r_w:
            tw = self.INPUT_W
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.INPUT_H - th) / 2)
            ty2 = self.INPUT_H - th - ty1
        else:
            tw = int(r_h * w)
            th = self.INPUT_H
            tx1 = int((self.INPUT_W - tw) / 2)
            tx2 = self.INPUT_W - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes tensor, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes tensor, each row is a box [x1, y1, x2, y2]
        """
        y = torch.zeros_like(x) if isinstance(x, torch.Tensor) else np.zeros_like(x)
        r_w = self.INPUT_W / origin_w
        r_h = self.INPUT_H / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.INPUT_H - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.INPUT_H - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.INPUT_W - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.INPUT_W - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        return y

    def post_process(self, output, origin_h, origin_w):
        """
        description: postprocess the prediction
        param:
            output:     A tensor likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] 
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a tensor, each element is the score correspoing to box
            result_classid: finally classid, a tensor, each element is the classid correspoing to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two-dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # to a torch Tensor
        pred = torch.Tensor(pred).cuda()
        # Get the boxes
        boxes = pred[:, :4]
        # Get the scores
        scores = pred[:, 4]
        # Get the classid
        classid = pred[:, 5]
        # Choose those boxes that score > CONF_THRESH
        si = scores > self.CONF_THRESH
        boxes = boxes[si, :]
        scores = scores[si]
        classid = classid[si]
        # Transform bbox from [center_x, center_y, w, h] to [x1, y1, x2, y2]
        boxes = self.xywh2xyxy(origin_h, origin_w, boxes)
        # Do nms
        indices = torchvision.ops.nms(boxes, scores, iou_threshold=self.IOU_THRESHOLD).cpu()
        result_boxes = boxes[indices, :].cpu()
        result_scores = scores[indices].cpu()
        result_classid = classid[indices].cpu()
        return result_boxes, result_scores, result_classid

    def load_model(self):

        self.PLUGIN_LIBRARY = self.cfg['MODEL']['PLUGIN_LIBRARY']
        ctypes.CDLL(self.PLUGIN_LIBRARY)

        self.cfx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        engine_file_path = self.cfg['MODEL']['CHKPT']
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings       

    def destroy(self):
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()

Thanks! Any help would be appreciated.

wang-xinyu commented 3 years ago

yolov5_trt.py cannot process video. You need to adapt it yourself.

simaiden commented 3 years ago

yolov5_trt.py cannot process video. You need to adapt it yourself.

Hi @wang-xinyu , thanks for the reply!

I already adapted the code; the class shown above calls "prepare_input" chained with "predict" on every frame, but I'm not sure the rest of the class is right because I'm not familiar with CUDA. Although I can now process the video, I think my problem was that tensorrtx was not compiled with OpenCV DNN.

simaiden commented 3 years ago

This is the jtop output when doing video inference with a yolov5s.engine model; notice the swap memory usage:

[jtop screenshot showing high swap memory usage]

wang-xinyu commented 3 years ago

Someone said torchvision.ops.nms consumes a lot of GPU memory. You can try the C++ executable. It's not recommended to use Python on devices like the NX.

simaiden commented 3 years ago

Someone said torchvision.ops.nms consumes a lot of GPU memory. You can try the C++ executable. It's not recommended to use Python on devices like the NX.

Thanks! It seems that PyTorch uses a different CUDA context than the one declared in the yolov5_trt.py inference function. Although I tried running NMS on the CPU only, I didn't notice lower memory usage or any FPS drop. If anyone has advice on this I would really appreciate it! Can this issue remain open until then?

For now I will try INT8 inference and check the performance

Thanks @wang-xinyu for your help and of course for the amazing job with this repo.

wang-xinyu commented 3 years ago

So does the C++ version have the RAM issue?

simaiden commented 3 years ago

So does the C++ version have the RAM issue?

With yolov5_trt.py over a folder of images the RAM still reaches high usage, about 7 GB.

With the compiled C++ version and the command sudo ./yolov5 -d yolo-s.engine image_folder it only uses about 3.2 GB, so yes... it must be a Python/PyCUDA/torch "issue".

Do you know a way to call ./yolov5 from Python? I can't use C++ for the overall program.

And again, thanks!

simaiden commented 3 years ago

So does the C++ version have the RAM issue?


Good news: I replaced the torchvision NMS implementation with the cv2.dnn.NMSBoxes implementation and now it uses only 3.5 GB!! I also noticed that merely importing torch, without calling any of its functions, uses an extra 1.5 GB... that seems very weird to me, but at least I've found the main problem.
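For reference, a minimal, self-contained sketch of that swap: CPU-side NMS via cv2.dnn.NMSBoxes instead of torchvision.ops.nms, so torch never has to be imported. The boxes, scores, and thresholds below are made-up illustration values; NMSBoxes expects boxes in [x, y, w, h] format.

import cv2
import numpy as np

# Two overlapping boxes and one separate box, in [x, y, w, h] format
boxes = [[10, 10, 50, 80], [12, 12, 50, 80], [200, 150, 40, 60]]
scores = [0.9, 0.8, 0.7]

# Pure CPU-side NMS; no torch and no extra CUDA context involved
indices = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=0.5, nms_threshold=0.4)

# Older OpenCV versions return indices with shape (N, 1), newer ones (N,)
keep = np.array(indices).flatten()
print(keep)  # e.g. [0 2]: the duplicate box at index 1 is suppressed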

wang-xinyu commented 3 years ago

OK, good to hear. You can call C++ executables from Python using the subprocess module.
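A minimal sketch of that approach, reusing the command mentioned earlier in this thread; the executable path, engine name, and image folder are assumptions taken from that command, so adjust them (and any sudo requirement) for your setup.

import subprocess

# Run the compiled C++ detector on a folder of images and capture its output.
# Prefix the args list with "sudo" if your setup requires it.
result = subprocess.run(
    ["./yolov5", "-d", "yolo-s.engine", "image_folder"],
    capture_output=True,
    text=True,
    check=True,  # raise CalledProcessError on a non-zero exit code
)
print(result.stdout)  # parse the detections printed by the executable here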

abuelgasimsaadeldin commented 3 years ago

Env

GPU: Jetson Nano
OS: Ubuntu 18.04
CUDA version: 10.2
TensorRT version: 7.1.3.0
Docker image: JetPack 4.5

About this repo

yolov5s

Hi @simaiden, I modified my python_trt.py script to run video inference on my Nano and I'm getting memory problems when trying to run. I'm not sure if I'm missing something or doing something wrong. I'm trying to troubleshoot right now, but I was also wondering if you would mind sharing the modified python_trt.py that worked for you on your Xavier NX? I would highly appreciate it. Thanks.

simaiden commented 3 years ago

Env

GPU: Jetson Nano
OS: Ubuntu 18.04
CUDA version: 10.2
TensorRT version: 7.1.3.0
Docker image: JetPack 4.5

About this repo

yolov5s

Hi @simaiden, I modified my python_trt.py script to run video inference on my Nano and I'm getting memory problems when trying to run. I'm not sure if I'm missing something or doing something wrong. I'm trying to troubleshoot right now, but I was also wondering if you would mind sharing the modified python_trt.py that worked for you on your Xavier NX? I would highly appreciate it. Thanks.

Yes I can, but take into account that the Jetson Nano has only 4 GB of memory, and yolov5s on my Xavier NX (8 GB) uses roughly 3.5 GB. Maybe you could try increasing the swap memory (https://github.com/JetsonHacksNano/resizeSwapMemory), but it's not a final solution.

This is my code:

import ctypes
import os
import random
import sys
import threading
import time
import cv2
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import tensorrt as trt
import logging

class YoloTRTPredictor():
    def __init__(self,cfg):
        self.cfg = cfg               

        self.INPUT_W = self.cfg['MODEL']['IMG_SIZE']
        self.INPUT_H = self.cfg['MODEL']['IMG_SIZE']
        self.CONF_THRESH = 0.5
        self.IOU_THRESHOLD = 0.4

        self.load_model()
        self.cfg_popped = True  # tracks whether the CUDA context has been popped (see stop())

    def predict(self,x,**kwargs):
        #threading.Thread.__init__(self)
        # Make self the active context, pushing it on top of the context stack.        
        self.cfx.push()
        self.cfg_popped = False
        # Restore
        stream = self.stream
        context = self.context
        engine = self.engine
        host_inputs = self.host_inputs
        cuda_inputs = self.cuda_inputs
        host_outputs = self.host_outputs
        cuda_outputs = self.cuda_outputs
        bindings = self.bindings
        # Do image preprocess
        # Preprocessing is done by the caller; x is already the tuple
        # (input_image, image_raw, origin_h, origin_w) from prepare_input()
        input_image, image_raw, origin_h, origin_w = x
        # Copy input image to host buffer
        np.copyto(host_inputs[0], input_image.ravel())
        # Transfer input data  to the GPU.
        cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
        # Run inference.
        context.execute_async(bindings=bindings, stream_handle=stream.handle)
        # Transfer predictions back from the GPU.
        cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
        # Synchronize the stream
        stream.synchronize()
        # Remove any context from the top of the context stack, deactivating it.
        self.cfx.pop()
        self.cfg_popped = True
        # Use the first output buffer, since batch_size = 1
        output = host_outputs[0]
        # Do postprocess
        result_boxes, result_scores, result_classid = self.post_process(
            output, origin_h, origin_w
        )
        count = result_boxes.shape[0]
        output = {'count' : count,
                'detections':result_boxes,
                'image':image_raw}  
        return output    

    def prepare_input(self, image_raw):
        """
        description: Read an image from image path, convert it to RGB,
                     resize and pad it to target size, normalize to [0,1],
                     transform to NCHW format.
        param:
            input_image_path: str, image path
        return:
            image:  the processed image
            image_raw: the original image
            h: original height
            w: original width
        """

        h, w, c = image_raw.shape
        image = cv2.cvtColor(image_raw, cv2.COLOR_BGR2RGB)
        # Calculate width, height and paddings
        r_w = self.INPUT_W / w
        r_h = self.INPUT_H / h
        if r_h > r_w:
            tw = self.INPUT_W
            th = int(r_w * h)
            tx1 = tx2 = 0
            ty1 = int((self.INPUT_H - th) / 2)
            ty2 = self.INPUT_H - th - ty1
        else:
            tw = int(r_h * w)
            th = self.INPUT_H
            tx1 = int((self.INPUT_W - tw) / 2)
            tx2 = self.INPUT_W - tw - tx1
            ty1 = ty2 = 0
        # Resize the image with long side while maintaining ratio
        image = cv2.resize(image, (tw, th))
        # Pad the short side with (128,128,128)
        image = cv2.copyMakeBorder(
            image, ty1, ty2, tx1, tx2, cv2.BORDER_CONSTANT, value=(128, 128, 128)
        )
        image = image.astype(np.float32)
        # Normalize to [0,1]
        image /= 255.0
        # HWC to CHW format:
        image = np.transpose(image, [2, 0, 1])
        # CHW to NCHW format
        image = np.expand_dims(image, axis=0)
        # Convert the image to row-major order, also known as "C order":
        image = np.ascontiguousarray(image)
        return image, image_raw, h, w

    def xywh2xyxy(self, origin_h, origin_w, x):
        """
        description:    Convert nx4 boxes from [x, y, w, h] to [x1, y1, x2, y2] where xy1=top-left, xy2=bottom-right
        param:
            origin_h:   height of original image
            origin_w:   width of original image
            x:          A boxes tensor, each row is a box [center_x, center_y, w, h]
        return:
            y:          A boxes tensor, each row is a box [x1, y1, x2, y2]
        """
        y = np.zeros_like(x)
        r_w = self.INPUT_W / origin_w
        r_h = self.INPUT_H / origin_h
        if r_h > r_w:
            y[:, 0] = x[:, 0] - x[:, 2] / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2 - (self.INPUT_H - r_w * origin_h) / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2 - (self.INPUT_H - r_w * origin_h) / 2
            y /= r_w
        else:
            y[:, 0] = x[:, 0] - x[:, 2] / 2 - (self.INPUT_W - r_h * origin_w) / 2
            y[:, 2] = x[:, 0] + x[:, 2] / 2 - (self.INPUT_W - r_h * origin_w) / 2
            y[:, 1] = x[:, 1] - x[:, 3] / 2
            y[:, 3] = x[:, 1] + x[:, 3] / 2
            y /= r_h

        #back to x,y,w,h format
        y[:,2] =  y[:,2] - y[:,0]
        y[:,3] =  y[:,3] - y[:,1]
        return y

    def post_process(self, output, origin_h, origin_w): 
        """
        description: postprocess the prediction
        param:
            output:     A tensor likes [num_boxes,cx,cy,w,h,conf,cls_id, cx,cy,w,h,conf,cls_id, ...] 
            origin_h:   height of original image
            origin_w:   width of original image
        return:
            result_boxes: finally boxes, a boxes tensor, each row is a box [x1, y1, x2, y2]
            result_scores: finally scores, a tensor, each element is the score correspoing to box
            result_classid: finally classid, a tensor, each element is the classid correspoing to box
        """
        # Get the num of boxes detected
        num = int(output[0])
        # Reshape to a two-dimensional ndarray
        pred = np.reshape(output[1:], (-1, 6))[:num, :]
        # Get the boxes
        boxes = pred[:, :4]
        # Get the scores
        scores = pred[:, 4]
        # Get the classid
        classid = pred[:, 5]
        # Choose those boxes that score > CONF_THRESH

        si = scores > self.CONF_THRESH
        boxes = boxes[si, :]
        scores = scores[si].tolist()
        classid = classid[si].tolist()
        boxes = self.xywh2xyxy(origin_h, origin_w, boxes).tolist()

        # Do nms
        indices = cv2.dnn.NMSBoxes(boxes, scores, score_threshold=self.CONF_THRESH, nms_threshold=self.IOU_THRESHOLD)
        boxes = np.array(boxes)
        scores = np.array(scores)
        # Older OpenCV versions return indices with shape (N, 1); flatten them
        indices = np.array(indices).flatten()
        classid = np.array(classid)
        if len(boxes)>0:
            boxes = boxes[indices, :]
            scores = scores[indices]
            classid = classid[indices]
        #return result_boxes, result_scores, result_classid
        boxes = np.reshape(boxes,(-1,4))
        boxes[:,2] =  boxes[:,2] + boxes[:,0]
        boxes[:,3] =  boxes[:,3] + boxes[:,1]

        return boxes, scores, classid

    def load_model(self):

        self.PLUGIN_LIBRARY = self.cfg['MODEL']['PLUGIN_LIBRARY']
        ctypes.CDLL(self.PLUGIN_LIBRARY)

        self.cfx = cuda.Device(0).make_context()
        stream = cuda.Stream()
        TRT_LOGGER = trt.Logger(trt.Logger.INFO)
        runtime = trt.Runtime(TRT_LOGGER)

        # Deserialize the engine from file
        engine_file_path = self.cfg['MODEL']['CHKPT']
        with open(engine_file_path, "rb") as f:
            engine = runtime.deserialize_cuda_engine(f.read())
        context = engine.create_execution_context()

        host_inputs = []
        cuda_inputs = []
        host_outputs = []
        cuda_outputs = []
        bindings = []

        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            dtype = trt.nptype(engine.get_binding_dtype(binding))
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)
            # Append the device buffer to device bindings.
            bindings.append(int(cuda_mem))
            # Append to the appropriate list.
            if engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        # Store
        self.stream = stream
        self.context = context
        self.engine = engine
        self.host_inputs = host_inputs
        self.cuda_inputs = cuda_inputs
        self.host_outputs = host_outputs
        self.cuda_outputs = cuda_outputs
        self.bindings = bindings       

    def stop(self):
        # Remove any context from the top of the context stack, deactivating it.
        if not self.cfg_popped:
            self.cfx.pop()

joneswilliam1 commented 3 years ago

Hey @abuelgasimsaadeldin, I am quite new to Python and would also like to learn how to do video inference. @simaiden, I had a look at your code and I am curious: I cannot find cv2.VideoCapture in the code to read the webcam, or are you missing the __main__? Sorry if my question is inappropriate as I am still learning. I want to run inference using my webcam.
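(Editor's note: the class above is only the predictor, so a capture loop has to be written around it. Below is a minimal sketch, assuming the cfg keys used in load_model above; the plugin/engine paths and the webcam index are placeholders, not values from this thread.)

import cv2

if __name__ == "__main__":
    # Placeholder paths and webcam index; adjust for your setup
    cfg = {"MODEL": {"IMG_SIZE": 608,
                     "PLUGIN_LIBRARY": "build/libmyplugins.so",
                     "CHKPT": "build/yolov5s.engine"}}
    predictor = YoloTRTPredictor(cfg)
    cap = cv2.VideoCapture(0)  # 0 = default webcam
    try:
        while True:
            ok, frame = cap.read()
            if not ok:
                break
            x = predictor.prepare_input(frame)   # preprocess one frame
            output = predictor.predict(x)        # run TensorRT inference
            print(output["count"], "detections")
    finally:
        cap.release()
        predictor.stop()  # pop the CUDA context exactly once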