wsy-yjys opened this issue 7 months ago
One more question: when RT-DETR's speed is measured, is image preprocessing excluded?
Postprocessing is included; image preprocessing is not.
OK, thanks. Where exactly is the code for the image preprocessing step in the PyTorch version?
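For reference, a minimal sketch of what the eval-time preprocessing boils down to, assuming it matches the plain resize-to-640 plus uint8→float [0, 1] conversion used in the script below (this is a paraphrase, not the repo's actual transform code):

```python
import torch
from torchvision.transforms import Compose, Resize, ConvertImageDtype

# Hypothetical stand-in for the eval transforms: plain resize to 640x640
# (no letterbox), then uint8 -> float32 rescaled to [0, 1].
preprocess = Compose([
    Resize([640, 640], antialias=True),
    ConvertImageDtype(torch.float32),
])

chw_uint8 = torch.randint(0, 256, (3, 480, 640), dtype=torch.uint8)  # dummy RGB image
batch = preprocess(chw_uint8)[None]  # (1, 3, 640, 640), float32 in [0, 1]
```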
Hi, I tested the end-to-end latency (preprocessing + inference + postprocessing) of yolov7_tiny and rtdetr_r18vd on a 2080 Ti with PyTorch 2.0.1, and found that rtdetr_r18vd is much slower than yolov7_tiny:
Methods | Size | Latency (end-to-end) | FPS on 2080 Ti
---|---|---|---
yolov7_tiny_syncbn_fast_2x64b-300e_coco | 640 | 5.7 ms | 175.4
rtdetr_r18vd_6x_coco | 640 | 12.7 ms | 78.7
Here is the PyTorch speed-test script I used for rtdetr_r18vd:
```python
import argparse
import glob
import os
import sys
import time
from collections import deque
from pathlib import Path

import cv2
import numpy as np
import torch
from torch import nn
from torchvision.transforms import Compose, Resize, ConvertImageDtype
from tqdm import tqdm
from calflops import calculate_flops

sys.path.append("../..")  # make the RT-DETR repo root importable
from src.core import YAMLConfig

# Supported file extensions
IMG_FORMATS = ["bmp", "jpg", "jpeg", "png", "tif", "tiff", "dng", "webp", "mpo"]
VID_FORMATS = ["mp4", "mov", "avi", "mkv"]
IMG_FORMATS.extend([f.upper() for f in IMG_FORMATS])
VID_FORMATS.extend([f.upper() for f in VID_FORMATS])


class MyCalcFPS:
    """Keeps a rolling window of per-image latencies."""

    def __init__(self, nsamples: int = 100):
        self.infer_time = deque(maxlen=nsamples)

    def update(self, t1: float, t2: float):
        self.infer_time.append(t2 - t1)

    def accumulate(self) -> float:
        if len(self.infer_time) > 1:
            return float(np.average(self.infer_time))
        print("only 1 sample!!!")
        return 0.0

    @staticmethod
    def time_sync() -> float:
        """Wait for all pending CUDA kernels to finish before reading the clock."""
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.time()


class GetFPS:
    def __init__(self, model: torch.nn.Module, args):
        self.device = torch.device(args.device)
        self.samples = LoadData(path=Path(args.image))
        self.model = model.to(self.device)

    @torch.no_grad()
    def get_oneepoch_fps(self, i: int) -> float:
        """Measure average end-to-end latency (preprocess + inference + postprocess) over one pass."""
        fps_calculator = MyCalcFPS()
        preprocess_image = Preprocess(640)
        for img, _, _ in tqdm(self.samples):
            t1 = fps_calculator.time_sync()
            # preprocess
            img = preprocess_image(img).to(self.device)
            # inference + postprocess; the postprocessor expects the target image size
            output = self.model(img, torch.tensor([img.shape[-2:]]).to(self.device))
            t2 = fps_calculator.time_sync()
            fps_calculator.update(t1, t2)
        infer_time = fps_calculator.accumulate()
        print("Epoch {} infer_time: {:.1f}ms".format(i, infer_time * 1000.0), end=" ")
        print(f"FPS: {1.0 / infer_time:0.1f}")
        return infer_time

    def get_nepoch_fps(self, repeat: int = 13):
        self.model.eval()
        infer_time = []
        for i in range(repeat):
            infer_time_once = self.get_oneepoch_fps(i)
            if i > 3:  # skip the first 4 passes as warm-up
                infer_time.append(infer_time_once)
        avg_infer_time = sum(infer_time) / len(infer_time)
        print("Summary:")
        print("avg_infer_time: {:.1f}ms".format(avg_infer_time * 1000.0), end="")
        print(f" | avg_FPS: {1.0 / avg_infer_time:0.1f}")


class LoadData:
    """Iterates over images and/or videos under a path (or a webcam stream)."""

    def __init__(self, path, webcam=False, webcam_addr=0):
        self.webcam = webcam
        self.webcam_addr = webcam_addr
        if webcam:  # use a web camera
            imgp = []
            vidp = [int(webcam_addr) if webcam_addr.isdigit() else webcam_addr]
        else:
            p = str(Path(path).resolve())  # os-agnostic absolute path
            if os.path.isdir(p):
                files = sorted(glob.glob(os.path.join(p, "**/*.*"), recursive=True))
            elif os.path.isfile(p):
                files = [p]
            else:
                raise FileNotFoundError(f"Invalid path {p}")
            imgp = [i for i in files if i.split(".")[-1] in IMG_FORMATS]
            vidp = [v for v in files if v.split(".")[-1] in VID_FORMATS]
        self.files = imgp + vidp
        self.nf = len(self.files)
        self.type = "image"
        if len(vidp) > 0:
            self.add_video(vidp[0])  # open the first video
        else:
            self.cap = None

    def checkext(self, path):
        if self.webcam:
            return "video"
        return "image" if path.split(".")[-1].lower() in IMG_FORMATS else "video"

    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]
        if self.checkext(path) == "video":
            self.type = "video"
            ret_val, img = self.cap.read()
            while not ret_val:
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]
                self.add_video(path)
                ret_val, img = self.cap.read()
        else:
            # Read image
            self.count += 1
            img = cv2.imread(path)  # BGR
        return img, path, self.cap

    def add_video(self, path):
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def __len__(self):
        return self.nf  # number of files


class Preprocess:
    def __init__(self, size):
        # Plain resize, no letterbox: the transformer takes a fixed input resolution
        self.transforms = Compose([
            Resize(size=[size, size], antialias=True),
            ConvertImageDtype(torch.float32),  # uint8 -> float32 in [0, 1]
        ])

    def __call__(self, image):
        image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
        image = torch.from_numpy(np.ascontiguousarray(image))
        image = self.transforms(image)
        return image[None]  # add batch dimension


class Model(nn.Module):
    def __init__(self, config=None, ckpt="") -> None:
        super().__init__()
        self.cfg = YAMLConfig(config, resume=ckpt)
        if ckpt:
            checkpoint = torch.load(ckpt, map_location="cpu")
            if "ema" in checkpoint:
                state = checkpoint["ema"]["module"]
            else:
                state = checkpoint["model"]
            # NOTE: loads the train-mode state dict
            self.cfg.model.load_state_dict(state)
        # convert to deploy mode
        self.model = self.cfg.model.deploy()
        self.postprocessor = self.cfg.postprocessor.deploy()

    def forward(self, images, orig_target_sizes):
        outputs = self.model(images)
        return self.postprocessor(outputs, orig_target_sizes)


def get_flops(model, size=640):
    input_shape = (1, 3, size, size)
    flops, macs, params = calculate_flops(model=model,
                                          input_shape=input_shape,
                                          output_as_string=True,
                                          output_precision=4,
                                          print_detailed=False)
    print("Model FLOPs:%s MACs:%s Params:%s \n" % (flops, macs, params))


def get_argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="configs/rtdetr/rtdetr_r18vd_6x_coco.yml", help="path to config file")
    parser.add_argument("--ckpt", default=None, help="path to checkpoint")
    parser.add_argument("--image", default=r"E:\paper\dataset\COCO100\val2017", help="path to images for inference")
    parser.add_argument("--device", default="cuda:0", help="inference device")
    parser.add_argument("--get-flops", action="store_true", help="also compute FLOPs")
    return parser


def main(args):
    model = Model(config=args.config, ckpt=args.ckpt)
    getfps = GetFPS(model, args)
    getfps.get_nepoch_fps(13)
    if args.get_flops:
        get_flops(model, size=640)


if __name__ == "__main__":
    args = get_argparser().parse_args()
    main(args)
```
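As an aside: per the reply above, the officially reported latency includes postprocessing but not preprocessing, so the script above measures a stricter end-to-end protocol. A minimal sketch of timing only inference + postprocess to match the reported numbers, reusing the `Model` wrapper from the script (this is my reading of the protocol, not the official benchmark code):

```python
import time
import torch

@torch.no_grad()
def timed_forward(model: torch.nn.Module, img: torch.Tensor) -> float:
    """Time inference + postprocess only, with preprocessing excluded.
    `model` is the Model wrapper above (forward(images, orig_target_sizes));
    `img` is an already-preprocessed (1, 3, H, W) tensor on the GPU."""
    sizes = torch.tensor([img.shape[-2:]]).to(img.device)
    torch.cuda.synchronize()          # flush pending kernels before starting the clock
    t1 = time.time()
    model(img, sizes)                 # inference + postprocess, both timed
    torch.cuda.synchronize()          # wait for the result before stopping the clock
    return time.time() - t1
```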
Hi, why is your FPS so low? And on my RTX 3060 with the PyTorch version, training on a 6000-image dataset takes 4-5 min and the 2000-image test set takes 45 s, whereas yolov8 only needs 1.3 min and 17 s. Could you advise?
@txw-github What do you mean? I don't follow. Are you saying rtdetr_r18vd_6x_coco in the table below shouldn't be this slow?
Methods | Size | Latency (end-to-end) | FPS on 2080 Ti
---|---|---|---
yolov7_tiny_syncbn_fast_2x64b-300e_coco | 640 | 5.7 ms | 175.4
rtdetr_r18vd_6x_coco | 640 | 12.7 ms | 78.7
When RT-DETR's speed is measured, is the postprocessing step included?