[Open issue] IronmanVsThanos opened this issue 12 months ago
# Standard library
import argparse
import os
import random
import time

# Third-party
import cv2
import numpy as np
import torch
import torchvision
from PIL import Image
from torchvision.ops.boxes import batched_nms

# Project-local (DINO repository)
import datasets.transforms as T
from datasets import build_dataset
from main import build_model_main
from util import box_ops
from util.slconfig import SLConfig
from util.visualizer import COCOVisualizer

# Accumulated model inference time (seconds) over all processed frames.
t_total = 0.0
def get_args_parser():
    """Build the command-line parser for the video inference script.

    Returns:
        argparse.ArgumentParser: parser exposing --weights, --input_video,
        --output_dir, --model_config_path and --device options.
    """
    parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
    parser.add_argument(
        '--weights',
        default="/mnt/sda1/Deep_learning/code/DINO-main/logs/DINO/R50-4S-coco_city/checkpoint_best_regular.pth",
        type=str)
    parser.add_argument(
        '--input_video',
        default="/mnt/sda1/Deep_learning/code/DINO-main/images/daolu1.avi",
        type=str)
    parser.add_argument(
        '--output_dir',
        default="/mnt/sda1/Deep_learning/code/DINO-main/images/output/inference_result.mp4",
        type=str)
    parser.add_argument('--model_config_path',
                        default="config/DINO/DINO_4scale.py", type=str)
    parser.add_argument('--device', default="cuda", type=str)
    return parser
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2) corner format.

    Args:
        x: (N, 4) tensor holding center-x, center-y, width, height per row.

    Returns:
        (N, 4) tensor holding top-left and bottom-right corner coordinates.
    """
    cx, cy, w, h = x.unbind(1)
    half_w = 0.5 * w
    half_h = 0.5 * h
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return torch.stack(corners, dim=1)
def rescale_bboxes(out_bbox, size):
    """Scale normalized (cx, cy, w, h) boxes up to absolute corner coordinates.

    Args:
        out_bbox: (N, 4) tensor of boxes in [0, 1] cxcywh format.
        size: (width, height) of the target image in pixels.

    Returns:
        (N, 4) float tensor of (x1, y1, x2, y2) boxes in pixel units.
    """
    img_w, img_h = size
    corners = box_cxcywh_to_xyxy(out_bbox)
    # Boxes come out of the model normalized to [0, 1]; stretch each
    # coordinate back to the network-input resolution.
    scale = torch.tensor([img_w, img_h, img_w, img_h], dtype=torch.float32)
    return corners * scale
def filter_boxes(scores, boxes, confidence=0.7, apply_nms=True, iou=0.5):
    """Keep detections whose best class score exceeds `confidence`,
    optionally running per-class NMS on the survivors.

    Args:
        scores: (N, C) per-query class probabilities.
        boxes: (N, 4) xyxy boxes aligned row-for-row with `scores`.
        confidence: minimum best-class score required to keep a detection.
        apply_nms: run torchvision batched (per-class) NMS when True.
        iou: IoU threshold used by the NMS step.

    Returns:
        The filtered (scores, boxes) pair.
    """
    # Boolean mask over the N queries: True where the best class beats the
    # confidence threshold.
    mask = scores.max(-1).values > confidence
    scores, boxes = scores[mask], boxes[mask]

    if apply_nms:
        best_scores, class_ids = scores.max(-1)
        survivors = batched_nms(boxes, best_scores, class_ids, iou)
        scores, boxes = scores[survivors], boxes[survivors]

    return scores, boxes
# Class-name lookup indexed by predicted class id; index 0 is the "N/A"
# placeholder. Presumably matches the label order of the custom coco_city
# training set named in the checkpoint path — verify against the dataset config.
CLASSES = ['N/A',"car", "coach", "bus", "truck", "tricycle", "person", "twowheelsvehicle", "taxi", "license_plate", "other_vehicles"]
def plot_one_box(x, img, color=None, label=None, line_thickness=2):
    """Draw one bounding box (and optional label) on `img` in place.

    Args:
        x: box as (x1, y1, x2, y2); any numeric sequence.
        img: HxWx3 uint8 ndarray (OpenCV layout assumed — confirm at call site).
        color: color triple; a random color is chosen when None.
        label: optional text drawn in a filled banner above the box.
        line_thickness: rectangle/font thickness; auto-scaled from the image
            size when falsy.
    """
    # Bug fix: the original referenced an undefined name `linethickness`;
    # use the `line_thickness` parameter.
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1
    # Bug fix: the original had `for in range(3)` (SyntaxError); bind to `_`.
    color = color or [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled label banner
        cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, [225, 255, 255],
                    thickness=tf, lineType=cv2.LINE_AA)
def process_video(args,):
    """Run DINO detection on every frame of a video and write an annotated copy.

    Args:
        args: parsed CLI namespace with weights, input_video, output_dir,
            model_config_path and device attributes.

    Side effects:
        Writes an annotated mp4 to args.output_dir and accumulates total
        inference time into the module-level `t_total`.
    """
    global t_total
    print(args)
    device = torch.device(args.device)

    # --- Build the model and load weights once, before the frame loop. ---
    model_args = SLConfig.fromfile(args.model_config_path)
    model_args.device = 'cuda'
    model, criterion, postprocessors = build_model_main(model_args)
    print(model)
    checkpoint = torch.load(args.weights, map_location='cpu')
    model.load_state_dict(checkpoint['model'])
    # Bug fix: the original called `model.cuda()` inside the loop on every
    # frame and ignored args.device; move to the requested device once here.
    model.to(device)
    _ = model.eval()
    n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print("parameters:", n_parameters)

    # --- Video I/O (named property constants instead of magic 3/4/5). ---
    cap = cv2.VideoCapture(args.input_video)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    out = cv2.VideoWriter(args.output_dir, cv2.VideoWriter_fourcc(*'mp4v'),
                          fps, (width, height))

    to_tensor = torchvision.transforms.ToTensor()  # hoisted loop invariant
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # OpenCV BGR -> RGB for the model; add a batch dim: (1, 3, H, W).
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        image_tensor = to_tensor(image).unsqueeze(0).to(device)

        time1 = time.time()
        # pred_logits: (1, num_queries, num_classes); pred_boxes: (1, num_queries, 4)
        inference_result = model(image_tensor)
        t_perFrame = time.time() - time1
        print("inference_time:", t_perFrame)
        t_total += t_perFrame

        # Softmax over classes, drop the last ("no object") column.
        probas = inference_result['pred_logits'].softmax(-1)[0, :, :-1].cpu()
        # Normalized cxcywh -> absolute xyxy at the network input resolution.
        bboxes_scaled = rescale_bboxes(inference_result['pred_boxes'][0].cpu(),
                                       (image_tensor.shape[3], image_tensor.shape[2]))
        scores, boxes = filter_boxes(probas, bboxes_scaled)
        scores = scores.data.numpy()
        boxes = boxes.data.numpy()

        # Bug fix: the original converted to ndarray inside the per-box loop,
        # so a frame with zero detections reached cv2.cvtColor as a PIL image
        # and crashed. Convert exactly once, before the loop.
        image = np.array(image)
        for i in range(boxes.shape[0]):
            class_id = scores[i].argmax()   # index of the best class
            label = CLASSES[class_id]
            confidence = scores[i].max()    # its score
            text = f"{label} {confidence:.3f}"
            print(text)
            plot_one_box(boxes[i], image, label=text)

        # RGB back to BGR for the OpenCV writer (pure channel swap).
        frame = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
        out.write(frame)

    cap.release()
    out.release()
    print("处理完成!用时:", t_total)
# Bug fix: the pasted source was mangled to `if name == 'main':`, which
# raises NameError at import time; restore the standard entry-point guard.
if __name__ == '__main__':
    parser = argparse.ArgumentParser('DETR training and evaluation script',
                                     parents=[get_args_parser()])
    args = parser.parse_args()
    process_video(args)
How can I run inference on a video? Thank you, my hero!