Closed SokPhanith closed 1 year ago
@SokPhanith you may want to add more timing around your video capture and pre-processing to find what is slowing it down. It looks like your pre-processing is using numpy whereas I use CUDA.
Yes, thank you. The slowdown is because my pre-processing uses numpy.
# Sorry, I would like to ask you something. I built a TensorRT AlexNet model
# with /usr/src/tensorrt/bin/trtexec (fp16) and then tested it from Python
# as shown below:
"""Classify camera, video or still-image frames with a serialized TensorRT engine.

Frames are resized, mean-subtracted and reordered to channel-first with
numpy/OpenCV on the host, copied to the device, run through the engine, and the
top-scoring label plus the measured inference rate are drawn on the frame.
"""
import argparse
import sys
import time

import cv2
import numpy as np
# Not referenced by name; PyCUDA documents this import as creating the CUDA
# context as a side effect.
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import tensorrt as trt
from caffe.proto import caffe_pb2

# Selects the GStreamer v4l2src pipeline for USB cameras instead of OpenCV's
# default backend. The original paste referenced this name without defining it.
USB_GSTREAMER = True

# Per-channel mean used when no --mean file is supplied (broadcasts over H x W).
DEFAULT_MEAN = (104., 117., 123.)


def gstreamer_pipeline(
        capture_width=3280,
        capture_height=2464,
        display_width=640,
        display_height=480,
        framerate=21,
        flip_method=0,
):
    """Return the GStreamer pipeline string used to open the CSI camera.

    Args:
        capture_width: Sensor capture width in pixels.
        capture_height: Sensor capture height in pixels.
        display_width: Width of the BGR frames delivered to the appsink.
        display_height: Height of the BGR frames delivered to the appsink.
        framerate: Capture frame rate (frames per second).
        flip_method: Value passed to nvvidconv's ``flip-method`` property.

    Returns:
        The pipeline description as a single string.
    """
    return (
        "nvarguscamerasrc ! "
        "video/x-raw(memory:NVMM), "
        "width=(int)%d, height=(int)%d, "
        "format=(string)NV12, framerate=(fraction)%d/1 ! "
        "nvvidconv flip-method=%d ! "
        "video/x-raw, width=(int)%d, height=(int)%d, format=(string)BGRx ! "
        "videoconvert ! "
        "video/x-raw, format=(string)BGR ! appsink drop=True"
        % (
            capture_width,
            capture_height,
            framerate,
            flip_method,
            display_width,
            display_height,
        )
    )


def open_cam_usb(dev, width, height):
    """Open USB camera ``/dev/video<dev>`` and return the cv2.VideoCapture.

    When USB_GSTREAMER is true the camera is opened through a v4l2src
    GStreamer pipeline at the requested size; otherwise ``dev`` is handed
    straight to cv2.VideoCapture and ``width``/``height`` are not applied.
    """
    if USB_GSTREAMER:
        gst_str = ('v4l2src device=/dev/video{} ! '
                   'video/x-raw, width=(int){}, height=(int){} ! '
                   'videoconvert ! appsink').format(dev, width, height)
        return cv2.VideoCapture(gst_str, cv2.CAP_GSTREAMER)
    return cv2.VideoCapture(dev)


def get_parser():
    """Build the command-line argument parser for this script."""
    parser = argparse.ArgumentParser(description="TensorRT runtime with caffe DeepLearning")
    parser.add_argument("--model", default="deploy.engine", help="path to tensorrt model.")
    parser.add_argument("--label", default="labels.txt", help="path to labels.txt one line label and no-background.")
    parser.add_argument("--csi", type=str, default=None, help="Take inputs from picamera.[o or 1]")
    parser.add_argument("--webcam", type=str, default=None, help="Take inputs from webcam /dev/video.")
    parser.add_argument('--image', type=str, default=None, help='path to image file name')
    parser.add_argument("--video", type=str, default=None, help="Path to video file.")
    parser.add_argument("--mean", type=str, default=None, help="mean file if model train on.")
    parser.add_argument("--height", type=int, default=224, help="height input image.")
    parser.add_argument("--width", type=int, default=224, help="width input image.")
    parser.add_argument("--height_display", type=int, default=480, help="height input image.")
    parser.add_argument("--width_display", type=int, default=640, help="width input image.")
    parser.add_argument("--batch_size", type=int, default=1, help="batch size input image.")
    parser.add_argument("--channel", type=int, default=3, help="channel input image.")
    return parser


class HostDeviceMem(object):
    """Pair of a host buffer and the device allocation it is copied to/from."""

    def __init__(self, host_mem, device_mem):
        self.host = host_mem
        self.device = device_mem

    def __str__(self):
        return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)

    def __repr__(self):
        return self.__str__()


def allocate_buffers(engine):
    """Allocate one host/device buffer pair for every binding of ``engine``.

    Each buffer holds ``volume(binding shape) * engine.max_batch_size``
    elements of the binding's dtype. Host buffers are allocated with
    ``cuda.pagelocked_empty``.

    Returns:
        ``(inputs, outputs, bindings, stream)`` where ``inputs``/``outputs``
        are lists of HostDeviceMem, ``bindings`` is the list of device
        addresses (as ints) in binding order, and ``stream`` is a new
        ``cuda.Stream``.
    """
    inputs = []
    outputs = []
    bindings = []
    stream = cuda.Stream()
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        dtype = trt.nptype(engine.get_binding_dtype(binding))
        host_mem = cuda.pagelocked_empty(size, dtype)
        device_mem = cuda.mem_alloc(host_mem.nbytes)
        bindings.append(int(device_mem))
        pair = HostDeviceMem(host_mem, device_mem)
        if engine.binding_is_input(binding):
            inputs.append(pair)
        else:
            outputs.append(pair)
    return inputs, outputs, bindings, stream


def do_inference(context, bindings, inputs, outputs, stream, batch_size=1):
    """Run one inference pass and return the host output buffers.

    Queues host-to-device copies for every input, the engine execution, and
    device-to-host copies for every output on ``stream``, then blocks until
    the stream has finished.

    Returns:
        A list with the ``host`` array of each entry in ``outputs``.
    """
    for inp in inputs:
        cuda.memcpy_htod_async(inp.device, inp.host, stream)
    context.execute_async(batch_size=batch_size, bindings=bindings,
                          stream_handle=stream.handle)
    for out in outputs:
        cuda.memcpy_dtoh_async(out.host, out.device, stream)
    stream.synchronize()
    return [out.host for out in outputs]


def _load_mean(mean_path):
    """Return the mean to subtract from frames, as a float32 H x W x C array.

    Without a mean file the per-channel DEFAULT_MEAN is returned with shape
    (1, 1, 3). With a mean file the Caffe BlobProto is parsed and converted
    from Caffe's channel-first storage order to H x W x C.
    """
    if not mean_path:
        return np.array([[DEFAULT_MEAN]], dtype=np.float32)
    mean_blob = caffe_pb2.BlobProto()
    with open(mean_path, 'rb') as mean_file:
        mean_blob.ParseFromString(mean_file.read())
    # Caffe stores blob data as (channels, height, width); reshaping straight
    # to (height, width, channels) would scramble the values.
    chw = np.asarray(mean_blob.data, dtype=np.float32).reshape(
        (mean_blob.channels, mean_blob.height, mean_blob.width))
    return chw.transpose((1, 2, 0))


def _open_source(args):
    """Open the capture selected on the command line.

    Returns:
        A cv2.VideoCapture for --csi/--webcam/--video, or None for --image.
        Exits the process when no source was given.
    """
    if args.csi:
        print("csi using")
        return cv2.VideoCapture(
            gstreamer_pipeline(flip_method=0,
                               display_width=args.width_display,
                               display_height=args.height_display),
            cv2.CAP_GSTREAMER)
    if args.image:
        print("image for classificatin")
        print(args.image)
        return None
    if args.webcam:
        print('webcam using')
        return open_cam_usb(int(args.webcam), args.width_display, args.height_display)
    if args.video:
        print('video for classification')
        return cv2.VideoCapture(args.video)
    print('None source for input need image, video, csi or webcam')
    sys.exit()


def _draw_text(frame, text, y):
    """Draw ``text`` at row ``y`` as a thin bright line over a thick dark one."""
    font = cv2.FONT_HERSHEY_PLAIN
    line = cv2.LINE_AA
    cv2.putText(frame, text, (11, y), font, 1.0, (32, 32, 32), 4, line)
    cv2.putText(frame, text, (10, y), font, 1.0, (0, 240, 240), 1, line)


def main():
    """Parse arguments, load the engine and run the classify/display loop."""
    args = get_parser().parse_args()
    cam = _open_source(args)

    mean_array = _load_mean(args.mean)
    print("mean shape :", mean_array.shape)
    if mean_array.shape[:2] not in ((1, 1), (args.height, args.width)):
        # A full mean image must match the resized frame to be subtracted.
        sys.exit('mean file is {}x{} but the network input is {}x{}'.format(
            mean_array.shape[0], mean_array.shape[1], args.height, args.width))

    trt_logger = trt.Logger(trt.Logger.INFO)
    runtime = trt.Runtime(trt_logger)
    window_name = args.model.split('.')[0]
    print('model :', args.model)
    with open(args.model, "rb") as engine_file:
        engine = runtime.deserialize_cuda_engine(engine_file.read())
    context = engine.create_execution_context()
    inputs, outputs, bindings, stream = allocate_buffers(engine)

    print('label :', args.label)
    with open(args.label) as label_file:
        labels = [row.strip() for row in label_file]

    try:
        while True:
            if args.image:
                frame = cv2.imread(args.image)
                if frame is None:
                    sys.exit('cannot read image: {}'.format(args.image))
            else:
                ok, frame = cam.read()
                if not ok:
                    # End of the video file or the camera stopped delivering.
                    break

            img = cv2.resize(frame, (args.width, args.height))
            img = img.astype(np.float32) - mean_array
            data = img.transpose((2, 0, 1)).ravel()
            # Copy into the existing page-locked buffer rather than rebinding
            # inputs[0].host, so the async copy keeps using pinned memory.
            np.copyto(inputs[0].host[:data.size], data)

            # Only the inference call is timed, so the displayed rate does not
            # include capture, pre-processing or drawing.
            t1 = time.time()
            [trt_outputs] = do_inference(context, bindings=bindings,
                                         inputs=inputs, outputs=outputs,
                                         stream=stream,
                                         batch_size=args.batch_size)
            dt = max(time.time() - t1, 1e-9)

            pred_index = trt_outputs.argmax()
            classes = labels[pred_index]
            percent = trt_outputs[pred_index]
            _draw_text(frame,
                       '{}: {:.2f}%'.format(classes, round(100 * percent, 2)),
                       40)
            _draw_text(frame, str(round(1.0 / dt, 2)) + ' fps', 20)

            cv2.imshow(window_name, frame)
            cv2.moveWindow(window_name, 0, 0)
            if cv2.waitKey(1) == ord('q'):
                break
    finally:
        cv2.destroyAllWindows()
        # No capture object exists in --image mode.
        if cam is not None:
            cam.release()


if __name__ == "__main__":
    main()

# I got a frame rate of around 30 fps+, but when I run the test with
# jetson-inference/build/aarch64/bin/imagenet-camera.py I get 50 fps+ with the
# same AlexNet. How can I improve my code? Or do I need to write it with
# threading?