lyuwenyu / RT-DETR

[CVPR 2024] Official RT-DETR (RTDETR paddle pytorch), Real-Time DEtection TRansformer, DETRs Beat YOLOs on Real-time Object Detection. 🔥 🔥 🔥
Apache License 2.0

About speed #248

Open wsy-yjys opened 6 months ago

wsy-yjys commented 6 months ago

When measuring speed, does RT-DETR include the post-processing step, i.e.

```python
results = postprocessors(outputs, orig_target_sizes)
```
wsy-yjys commented 6 months ago

One more question: when measuring speed, is the image preprocessing step excluded?

lyuwenyu commented 6 months ago

Post-processing is included; image preprocessing is not.
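
A minimal sketch of that timing boundary (names are illustrative; `model` and `postprocessor` stand in for the deploy-mode pair this repo exposes):

```python
import time
import torch

@torch.no_grad()
def timed_forward(model, postprocessor, image, orig_size):
    """Time inference + post-processing; preprocessing stays outside the clock."""
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # drain pending kernels before starting the clock
    t0 = time.time()
    outputs = model(image)                       # timed: model inference
    results = postprocessor(outputs, orig_size)  # timed: post-processing
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # wait for async CUDA work before stopping the clock
    return results, time.time() - t0
```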

wsy-yjys commented 6 months ago

OK, thanks. Where exactly is the image preprocessing code in the PyTorch version?

lyuwenyu commented 6 months ago

https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml#L29-L35
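
For reference, those lines define the eval-time transforms; a rough torchvision equivalent (a sketch, not the repo's code path — the authoritative ops are the ones in the YAML) is:

```python
import torch
from torchvision.transforms import Compose, Resize, ConvertImageDtype

# Resize to the 640x640 network input and convert uint8 CHW tensors
# to float32 in [0, 1] (ConvertImageDtype rescales when converting).
eval_transforms = Compose([
    Resize([640, 640], antialias=True),
    ConvertImageDtype(torch.float32),
])
```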

wsy-yjys commented 6 months ago

Hello, I measured the end-to-end latency (preprocessing + inference + post-processing) of yolov7_tiny and rtdetr_r18vd on a 2080 Ti with PyTorch 2.0.1, and rtdetr_r18vd turns out to be much slower than yolov7_tiny:

| Methods | Size | Latency (end-to-end) | FPS on 2080 Ti |
| --- | --- | --- | --- |
| yolov7_tiny_syncbn_fast_2x64b-300e_coco | 640 | 5.7 ms | 175.4 |
| rtdetr_r18vd_6x_coco | 640 | 12.7 ms | 78.7 |
wsy-yjys commented 6 months ago

Here is the PyTorch speed-test script I used for rtdetr_r18vd:

```python
import argparse
import glob
import os
import sys
import time
from collections import deque
from pathlib import Path

import cv2
import numpy as np
import torch
from torch import nn
from torchvision.transforms import Compose, Resize, ConvertImageDtype
from tqdm import tqdm
from calflops import calculate_flops

sys.path.append("../..")  # make the repo's src package importable
from src.core import YAMLConfig

# Parameters
IMG_FORMATS = ["bmp", "jpg", "jpeg", "png", "tif", "tiff", "dng", "webp", "mpo"]
VID_FORMATS = ["mp4", "mov", "avi", "mkv"]
IMG_FORMATS.extend([f.upper() for f in IMG_FORMATS])
VID_FORMATS.extend([f.upper() for f in VID_FORMATS])

class MyCalcFPS:
    def __init__(self, nsamples: int = 100):
        self.infer_time = deque(maxlen=nsamples)

    def update(self, t1: float, t2: float):
        self.infer_time.append(t2 - t1)

    def accumulate(self):
        if len(self.infer_time) > 1:
            return np.average(self.infer_time)
        else:
            print("only 1 sample!!!")
            return 0.0

    def time_sync(self):
        '''Waits for all kernels in all streams on a CUDA device to complete if cuda is available.'''
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        return time.time()

class GetFPS:
    def __init__(self, model: torch.nn.Module, args):
        img_path = Path(args.image)
        self.device = torch.device(args.device)
        self.samples = LoadData(path=img_path)
        self.model = model.to(self.device)

    @torch.no_grad()
    def get_oneepoch_fps(self, i):
        ''' get fps by wsy '''
        fps_calculator = MyCalcFPS()
        preprocess_image = Preprocess(640)
        for img, _, _ in tqdm(self.samples):
            t1 = fps_calculator.time_sync()
            # preprocess
            img = preprocess_image(img).to(self.device)
            # infer and postprocess; orig_target_sizes is the network input size
            output = self.model(img, torch.tensor([img.shape[-2:]]).to(self.device))
            t2 = fps_calculator.time_sync()
            fps_calculator.update(t1, t2)

        infer_time = fps_calculator.accumulate()
        print("Epoch {}  infer_time: {:.1f}ms".format(i, infer_time * 1000.0), end=" ")
        avg_fps = 1.0 / infer_time
        print(f"FPS: {avg_fps:0.1f}")

        return infer_time

    def get_nepoch_fps(self, repeat=13):
        # eval method
        self.model.eval()

        infer_time = []
        for i in range(repeat):
            infer_time_once = self.get_oneepoch_fps(i)
            # discard the first 4 epochs as GPU warm-up
            if i > 3:
                infer_time.append(infer_time_once)

        avg_infer_time = sum(infer_time) / len(infer_time)
        print("Summary:")
        print("avg_infer_time: {:.1f}ms".format(avg_infer_time * 1000.0), end="")
        avg_fps = 1.0 / avg_infer_time
        print(f" | avg_FPS: {avg_fps:0.1f}")

class LoadData:
    def __init__(self, path, webcam=False, webcam_addr=0):
        self.webcam = webcam
        self.webcam_addr = webcam_addr
        if webcam:  # if use web camera
            imgp = []
            vidp = [int(webcam_addr) if str(webcam_addr).isdigit() else webcam_addr]
        else:
            p = str(Path(path).resolve())  # os-agnostic absolute path
            if os.path.isdir(p):
                files = sorted(glob.glob(os.path.join(p, '**/*.*'), recursive=True))  # dir
            elif os.path.isfile(p):
                files = [p]  # files
            else:
                raise FileNotFoundError(f'Invalid path {p}')
            imgp = [i for i in files if i.split('.')[-1] in IMG_FORMATS]
            vidp = [v for v in files if v.split('.')[-1] in VID_FORMATS]
        self.files = imgp + vidp
        self.nf = len(self.files)
        self.type = 'image'
        if len(vidp) > 0:
            self.add_video(vidp[0])  # new video
        else:
            self.cap = None

    def checkext(self, path):
        if self.webcam:
            file_type = 'video'
        else:
            file_type = 'image' if path.split('.')[-1].lower() in IMG_FORMATS else 'video'
        return file_type

    def __iter__(self):
        self.count = 0
        return self

    def __next__(self):
        if self.count == self.nf:
            raise StopIteration
        path = self.files[self.count]
        if self.checkext(path) == 'video':
            self.type = 'video'
            ret_val, img = self.cap.read()
            while not ret_val:
                self.count += 1
                self.cap.release()
                if self.count == self.nf:  # last video
                    raise StopIteration
                path = self.files[self.count]
                self.add_video(path)
                ret_val, img = self.cap.read()
        else:
            # Read image
            self.count += 1
            img = cv2.imread(path)  # BGR
        return img, path, self.cap

    def add_video(self, path):
        self.frame = 0
        self.cap = cv2.VideoCapture(path)
        self.frames = int(self.cap.get(cv2.CAP_PROP_FRAME_COUNT))

    def __len__(self):
        return self.nf  # number of files

class Preprocess:
    def __init__(self, size):
        self.transforms = Compose([
            Resize(size=[size, size], antialias=True),
            ConvertImageDtype(torch.float32),  # uint8 [0, 255] -> float32 [0, 1]
        ])

    def __call__(self, image):
        # no letterbox: the transformer takes a fixed square input resolution
        image = image.transpose((2, 0, 1))[::-1]  # HWC to CHW, BGR to RGB
        image = torch.from_numpy(np.ascontiguousarray(image))
        image = self.transforms(image)
        return image[None]

class Model(nn.Module):
    def __init__(self, config=None, ckpt="") -> None:
        super().__init__()
        self.cfg = YAMLConfig(config, resume=ckpt)
        if ckpt:
            checkpoint = torch.load(ckpt, map_location='cpu')
            if 'ema' in checkpoint:
                state = checkpoint['ema']['module']
            else:
                state = checkpoint['model']
            # NOTE load train mode state
            self.cfg.model.load_state_dict(state)

        # convert to deploy mode
        self.model = self.cfg.model.deploy()
        self.postprocessor = self.cfg.postprocessor.deploy()
        # print(self.postprocessor.deploy_mode)

    def forward(self, images, orig_target_sizes):
        outputs = self.model(images)
        return self.postprocessor(outputs, orig_target_sizes)

def get_flops(model, size=640):
    input_shape = (1, 3, size, size)
    flops, macs, params = calculate_flops(model=model,
                                          input_shape=input_shape,
                                          output_as_string=True,
                                          output_precision=4,
                                          print_detailed=False)  # do not print detailed info
    print("Model FLOPs:%s   MACs:%s   Params:%s \n" % (flops, macs, params))

def get_argparser():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="configs/rtdetr/rtdetr_r18vd_6x_coco.yml", help="配置文件路径")
    parser.add_argument("--ckpt", default=None, help="权重文件路径")
    parser.add_argument("--image", default=r'E:\paper\dataset\COCO100\val2017', help="待推理图片路径")
    parser.add_argument("--device", default="cuda:0", help="推理设备")
    parser.add_argument("--get-flops", action="store_true", help="是否计算flops")

    return parser

def main(args):
    model = Model(config=args.config, ckpt=args.ckpt)
    getfps = GetFPS(model, args)
    getfps.get_nepoch_fps(13)
    if args.get_flops:
        get_flops(model, size=640)

if __name__ == "__main__":
    args = get_argparser().parse_args()
    main(args)
```
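
Assuming the script is saved as, say, `speed_test.py` under `rtdetr_pytorch/` (the filename and location are illustrative), it runs as `python speed_test.py --config configs/rtdetr/rtdetr_r18vd_6x_coco.yml --ckpt <checkpoint.pth> --image <image_dir>`. Note that it times preprocessing, inference, and post-processing together, so its numbers are not directly comparable to the official ones, which exclude preprocessing.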
txw-github commented 4 months ago

Hello, why is your FPS so low? And why, on an RTX 3060 with the PyTorch version, does training on my 6000-image dataset take 4-5 min and the 2000-image test set take 45 s, while yolov8 only needs 1.3 min and 17 s? Any guidance would be appreciated.

wsy-yjys commented 4 months ago

@txw-github What do you mean? I don't follow. Are you saying that rtdetr_r18vd_6x_coco in the table below shouldn't be this slow?

| Methods | Size | Latency (end-to-end) | FPS on 2080 Ti |
| --- | --- | --- | --- |
| yolov7_tiny_syncbn_fast_2x64b-300e_coco | 640 | 5.7 ms | 175.4 |
| rtdetr_r18vd_6x_coco | 640 | 12.7 ms | 78.7 |