IDEA-Research / DINO

[ICLR 2023] Official implementation of the paper "DINO: DETR with Improved DeNoising Anchor Boxes for End-to-End Object Detection"
Apache License 2.0

how to inference a video ? #228

Open IronmanVsThanos opened 12 months ago

IronmanVsThanos commented 12 months ago

How do I run inference on a video? Thank you, my hero.

IronmanVsThanos commented 11 months ago

```python
import argparse
import os
import random
import time

import cv2
import numpy as np
import torch
import torchvision
from torchvision.ops.boxes import batched_nms
from PIL import Image

from main import build_model_main
from util.slconfig import SLConfig
from util.visualizer import COCOVisualizer
from util import box_ops
from datasets import build_dataset
import datasets.transforms as T

t_total = 0.0  # accumulated inference time across all frames
```

```python
def get_args_parser():
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
    parser.add_argument('--weights', default="/mnt/sda1/Deep_learning/code/DINO-main/logs/DINO/R50-4S-coco_city/checkpoint_best_regular.pth", type=str)
    parser.add_argument('--input_video', default="/mnt/sda1/Deep_learning/code/DINO-main/images/daolu1.avi", type=str)
    parser.add_argument('--output_dir', default="/mnt/sda1/Deep_learning/code/DINO-main/images/output/inference_result.mp4", type=str)
    parser.add_argument('--model_config_path', default="config/DINO/DINO_4scale.py", type=str)
    parser.add_argument('--device', default="cuda", type=str)
    return parser
```

```python
def box_cxcywh_to_xyxy(x):
    # unbind splits x along dim 1; x has shape (100, 4), giving four (100,) tensors
    x_c, y_c, w, h = x.unbind(1)
    # b is a list of four (100,) tensors: top-left x, top-left y,
    # bottom-right x, bottom-right y
    b = [(x_c - 0.5 * w), (y_c - 0.5 * h),
         (x_c + 0.5 * w), (y_c + 0.5 * h)]
    return torch.stack(b, dim=1)
```
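As a quick sanity check of the conversion (not part of the original script), a box centered at (0.5, 0.5) with width and height 0.2 should map to corners (0.4, 0.4, 0.6, 0.6):

```python
import torch

# a single box in (cx, cy, w, h) format
boxes = torch.tensor([[0.5, 0.5, 0.2, 0.2]])
print(box_cxcywh_to_xyxy(boxes))
# expected: tensor([[0.4000, 0.4000, 0.6000, 0.6000]])
```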

```python
def rescale_bboxes(out_bbox, size):
    img_w, img_h = size
    # b is a (100, 4) tensor, now converted to (x1, y1, x2, y2) format
    b = box_cxcywh_to_xyxy(out_bbox)
    # coordinates are normalized to [0, 1]; scale back to the network input size
    b = b * torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return b
```

```python
def filter_boxes(scores, boxes, confidence=0.7, apply_nms=True, iou=0.5):
    # boxes: (100, 4), scores: (100, num_classes); scores.max(-1) returns both
    # the max value and its index along the last dimension.
    # keep is a boolean mask over the 100 queries marking which ones contain an
    # object, used to pull out the matching scores and boxes
    keep = scores.max(-1).values > confidence
    scores, boxes = scores[keep], boxes[keep]

    if apply_nms:
        top_scores, labels = scores.max(-1)
        keep = batched_nms(boxes, top_scores, labels, iou)
        scores, boxes = scores[keep], boxes[keep]

    return scores, boxes
```
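A minimal check of `filter_boxes` (toy tensors of my own, not from the script): only the query that clears the 0.7 default threshold should survive:

```python
import torch

# two queries over three classes; only the first has a score above 0.7
scores = torch.tensor([[0.05, 0.95, 0.10],
                       [0.10, 0.20, 0.15]])
boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 12., 52., 52.]])
kept_scores, kept_boxes = filter_boxes(scores, boxes)
print(kept_boxes.shape)  # torch.Size([1, 4])
```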

```python
# class names (labelled "COCO classes" in the source, but this is a custom
# list matching the author's own training data)
CLASSES = ['N/A', "car", "coach", "bus", "truck", "tricycle", "person",
           "twowheelsvehicle", "taxi", "license_plate", "other_vehicles"]
```

```python
def plot_one_box(x, img, color=None, label=None, line_thickness=2):
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled label background
        cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255],
                    thickness=tf, lineType=cv2.LINE_AA)
```
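A throwaway check for `plot_one_box` (the canvas and label here are invented for illustration):

```python
import numpy as np
import cv2

canvas = np.zeros((480, 640, 3), dtype=np.uint8)  # blank BGR image
plot_one_box([50, 60, 200, 220], canvas, label="car 0.912")
cv2.imwrite("plot_one_box_demo.jpg", canvas)
```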

```python
def process_video(args):
    global t_total
    print(args)
    device = torch.device(args.device)
    model_args = SLConfig.fromfile(args.model_config_path)
    model_args.device = 'cuda'
    # model: the network; criterion: the loss; postprocessors: output post-processing
    model, criterion, postprocessors = build_model_main(model_args)
    print(model)
    checkpoint = torch.load(args.weights, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    _ = model.eval()

    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("parameters:", n_parameters)
    cap = cv2.VideoCapture(args.input_video)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(args.output_dir, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        image_totensor = torchvision.transforms.ToTensor()
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_tensor = image_totensor(image)
        # add a batch dimension, e.g. (1, 3, 1080, 810)
        image_tensor = image_tensor.unsqueeze(0).to(device)

        time1 = time.time()
        # pred_logits: (1, 100, num_classes + 1), pred_boxes: (1, 100, 4)
        inference_result = model.cuda()(image_tensor)
        time2 = time.time()
        t_perFrame = time2 - time1
        print("inference_time:", t_perFrame)
        t_total += t_perFrame

        # softmax over the last dimension, take batch element 0, and drop the
        # last column (the no-object/background class) -> probas: (100, num_classes)
        probas = inference_result['pred_logits'].softmax(-1)[0, :, :-1].cpu()
        # pred_boxes[0] has shape (100, 4); convert cxcywh -> xyxy and scale
        # to the network input size
        bboxes_scaled = rescale_bboxes(inference_result['pred_boxes'][0].cpu(),
                                       (image_tensor.shape[3], image_tensor.shape[2]))
        # keep only the plausible detections, e.g. boxes (7, 4), scores (7, num_classes)
        scores, boxes = filter_boxes(probas, bboxes_scaled)
        scores = scores.data.numpy()
        boxes = boxes.data.numpy()
        # convert once, before drawing, so frames with no detections still work
        image = np.array(image)
        for i in range(boxes.shape[0]):
            # argmax() gives the index of the highest class score for this box
            class_id = scores[i].argmax()
            # map the index to a class name
            label = CLASSES[class_id]
            # max() gives the highest confidence itself
            confidence = scores[i].max()
            text = f"{label} {confidence:.3f}"
            print(text)
            plot_one_box(boxes[i], image, label=text)
        # image is RGB (from PIL); convert back to BGR for the VideoWriter
        frame = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        out.write(frame)

    cap.release()
    out.release()
    print("Done! Total time:", t_total)
```

```python
if __name__ == '__main__':
    parser = argparse.ArgumentParser('DETR training and evaluation script',
                                     parents=[get_args_parser()])
    args = parser.parse_args()
    process_video(args)
```
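For what it's worth, `build_model_main` already returns `postprocessors`, and the repo's inference notebook runs the `'bbox'` postprocessor instead of the manual softmax/rescale pipeline. A sketch of that route (the 0.3 threshold is my own choice):

```python
# inside the frame loop, replacing the manual softmax/rescale/filter steps;
# a (1.0, 1.0) target size yields xyxy boxes normalized to [0, 1]
output = model.cuda()(image_tensor)
result = postprocessors['bbox'](output, torch.Tensor([[1.0, 1.0]]).cuda())[0]
keep = result['scores'] > 0.3  # confidence threshold, pick to taste
boxes = result['boxes'][keep] * torch.tensor([width, height, width, height],
                                             device=result['boxes'].device)
labels = result['labels'][keep].tolist()
```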