Robert-JunWang / Pelee

Pelee: A Real-Time Object Detection System on Mobile Devices
Apache License 2.0
885 stars 254 forks source link

How can I get 120 FPS on the NVIDIA TX2? Please help me #72

Open oujieww opened 5 years ago

oujieww commented 5 years ago

I wrote a webcam demo for this, but I cannot get the 120 FPS reported in the paper. Can anyone help me? ('cap read frame time : ', 0.03454303741455078) ('detect time: ', 0.1441190242767334)

this is my code import numpy as np import matplotlib.pyplot as plt import time

import os import caffe import cv2 from google.protobuf import text_format from caffe.proto import caffe_pb2 caffe.set_mode_gpu()

load PASCAL VOC labels

# Load the PASCAL VOC label map from its prototxt definition.
labelmap_file = 'model/voc/labelmap_voc.prototxt'
labelmap = caffe_pb2.LabelMap()
# Context manager closes the file handle (the original leaked it and
# shadowed the builtin name `file`); `str()` around `.read()` was a no-op.
with open(labelmap_file, 'r') as f:
    text_format.Merge(f.read(), labelmap)

def get_labelname(labelmap, labels):
    """Map numeric detection labels to human-readable display names.

    Args:
        labelmap: a caffe_pb2.LabelMap (any object with an `item` sequence
            whose entries carry `label` and `display_name`).
        labels: a single numeric label or a list of numeric labels. Float
            labels (as produced by `detections[...].tolist()`) match their
            integer counterparts.

    Returns:
        A list of display-name strings, one per input label.

    Raises:
        ValueError: if a label is not present in the label map. (The
            original used a bare `assert`, which is stripped under -O.)
    """
    if not isinstance(labels, list):
        labels = [labels]
    # Build the lookup table once: O(n + m) instead of a linear scan of
    # the map for every label. Also replaces Python-2-only `xrange`.
    name_by_label = {item.label: item.display_name for item in labelmap.item}
    labelnames = []
    for label in labels:
        if label not in name_by_label:
            raise ValueError('label %r not found in labelmap' % (label,))
        labelnames.append(name_by_label[label])
    return labelnames

# Paths to the merged (BN-folded) Pelee SSD model.
model_def = 'model/voc/deploy_merged.prototxt'
model_weights = 'model/voc/pelee_merged.caffemodel'

# Instantiate the network in TEST phase (inference mode: no dropout etc.).
net = caffe.Net(
    model_def,      # defines the structure of the model
    model_weights,  # contains the trained weights
    caffe.TEST,
)

input preprocessing: 'data' is the name of the input blob == net.inputs[0]

# Input preprocessing for the 'data' blob: HWC->CHW, scale by 0.017 after
# mean subtraction, operate in [0, 255], and swap RGB -> BGR channel order.
transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
transformer.set_transpose('data', (2, 0, 1))
transformer.set_input_scale('data', 0.017)
transformer.set_mean('data', np.array([103.94, 116.78, 123.68]))  # mean pixel (BGR)
transformer.set_raw_scale('data', 255)  # reference model works in [0,255], not [0,1]
transformer.set_channel_swap('data', (2, 1, 0))  # reference model is BGR, not RGB

font = cv2.FONT_HERSHEY_SIMPLEX


def open_cam_onboard(width, height):
    """Open the Jetson onboard camera through a GStreamer pipeline.

    The sensor is captured at a fixed 800x600@5fps and converted to BGRx at
    the requested `width` x `height` before being handed to OpenCV.
    """
    # On versions of L4T previous to L4T 28.1, flip-method=2
    pipeline = (
        "nvcamerasrc ! "
        "video/x-raw(memory:NVMM), width=(int)800, height=(int)600, "
        "format=(string)I420, framerate=(fraction)5/1 ! "
        "nvvidconv ! video/x-raw, width=(int){}, height=(int){}, "
        "format=(string)BGRx ! "
        "videoconvert ! appsink"
    ).format(width, height)
    return cv2.VideoCapture(pipeline, cv2.CAP_GSTREAMER)

def do_detect(image, img2, conf_thresh=0.4):
    """Run SSD detection on `image` and draw the results onto `img2`.

    Args:
        image: preprocessed RGB float image fed through `transformer` into
            the network's 'data' blob.
        img2: image (typically the original BGR frame) that rectangles and
            labels are drawn on.
        conf_thresh: minimum confidence for a detection to be kept
            (was hard-coded to 0.4; default preserves that behavior).

    Returns:
        `img2` with detection boxes and "label:score" text drawn on it.
    """
    net.blobs['data'].data[...] = transformer.preprocess('data', image)

    # Forward pass. SSD's 'detection_out' rows are
    # [image_id, label, confidence, xmin, ymin, xmax, ymax] (coords in [0,1]).
    detections = net.forward()['detection_out']

    det_label = detections[0, 0, :, 1]
    det_conf = detections[0, 0, :, 2]
    det_xmin = detections[0, 0, :, 3]
    det_ymin = detections[0, 0, :, 4]
    det_xmax = detections[0, 0, :, 5]
    det_ymax = detections[0, 0, :, 6]

    # Keep detections at or above the confidence threshold.
    top_indices = [i for i, conf in enumerate(det_conf) if conf >= conf_thresh]

    top_conf = det_conf[top_indices]
    top_label_indices = det_label[top_indices].tolist()
    top_labels = get_labelname(labelmap, top_label_indices)
    top_xmin = det_xmin[top_indices]
    top_ymin = det_ymin[top_indices]
    top_xmax = det_xmax[top_indices]
    top_ymax = det_ymax[top_indices]

    # Hoist the loop-invariant frame dimensions; use range (xrange is
    # Python-2-only) to scale normalized coords to pixels.
    img_h, img_w = image.shape[0], image.shape[1]
    for i in range(top_conf.shape[0]):
        xmin = int(round(top_xmin[i] * img_w))
        ymin = int(round(top_ymin[i] * img_h))
        xmax = int(round(top_xmax[i] * img_w))
        ymax = int(round(top_ymax[i] * img_h))
        score = top_conf[i]
        label_name = top_labels[i]
        img2 = cv2.rectangle(img2, (xmin, ymin), (xmax, ymax), (0, 255, 0))
        img2 = cv2.putText(img2, label_name + ':' + str(score),
                           (xmin, ymin - 5), font, 2, (0, 0, 255), 1)
    return img2

set net to batch size of 1

# Set the network to batch size 1 at the model's 304x304 input resolution.
image_resize = 304
net.blobs['data'].reshape(1, 3, image_resize, image_resize)

capture = open_cam_onboard(800, 600)
try:
    while True:
        time0 = time.time()
        ret, frame = capture.read()
        if not ret:
            # Camera read failed: `frame` would be None and cvtColor would
            # crash; the original never checked `ret`.
            break
        frame1 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame1 = frame1 / 255.
        print("cap read frame time : ", time.time() - time0)
        time1 = time.time()
        res = do_detect(frame1, frame)
        print("detect time: ", time.time() - time1)
        cv2.imshow('frame', res)
        if cv2.waitKey(1) == ord('q'):
            break
finally:
    # Release the camera and GUI resources (the original leaked both).
    capture.release()
    cv2.destroyAllWindows()

sparshgarg23 commented 3 years ago

What FPS are you getting? In the paper it's mentioned:

The speed is calculated as the average time of processing 100 pictures with batch size 1. We run the 100-picture processing 10 times separately and average the time. The paper also uses FP16 instead of FP32 to achieve the desired FPS. It would be great if the author could confirm this.