WongKinYiu / yolov7

Implementation of paper - YOLOv7: Trainable bag-of-freebies sets new state-of-the-art for real-time object detectors
GNU General Public License v3.0

How can I save bounding box information and keypoint values to a txt file? #1457

Open · Owen-Owen opened this issue 1 year ago

Owen-Owen commented 1 year ago

When I run the yolov7-pose branch, there is a --save-txt option in detect.py, but it only saves the bounding box information. I want both the bbox and keypoint info in a txt file. Who can help?

cwx-cv commented 1 year ago

+1

njc44 commented 1 year ago

> When I run the yolov7-pose branch, there is a --save-txt option in detect.py, but it only saves the bounding box information. I want both the bbox and keypoint info in a txt file. Who can help?

@Owen-Owen @computer-vision666

!git clone https://github.com/WongKinYiu/yolov7.git

cd yolov7

import argparse
import time
from pathlib import Path

import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random

from models.experimental import attempt_load
from utils.datasets import LoadStreams, LoadImages
from utils.general import check_img_size, check_requirements, check_imshow, non_max_suppression, apply_classifier, \
    scale_coords, xyxy2xywh, strip_optimizer, set_logging, increment_path
from utils.plots import plot_one_box
from utils.torch_utils import select_device, load_classifier, time_synchronized, TracedModel

def detect(save_img=False):
    bbox = None
    source, weights, view_img, save_txt, imgsz, trace = opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size, not opt.no_trace
    save_img = not opt.nosave and not source.endswith('.txt')  # save inference images
    webcam = source.isnumeric() or source.endswith('.txt') or source.lower().startswith(
        ('rtsp://', 'rtmp://', 'http://', 'https://'))

    # Directories
    save_dir = Path(increment_path(Path(opt.project) / opt.name, exist_ok=opt.exist_ok))  # increment run
    (save_dir / 'labels' if save_txt else save_dir).mkdir(parents=True, exist_ok=True)  # make dir

    # Initialize
    set_logging()
    device = select_device(opt.device)
    half = device.type != 'cpu'  # half precision only supported on CUDA

    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    stride = int(model.stride.max())  # model stride
    imgsz = check_img_size(imgsz, s=stride)  # check img_size

    if trace:
        model = TracedModel(model, device, opt.img_size)

    if half:
        model.half()  # to FP16

    # Second-stage classifier
    classify = False
    if classify:
        modelc = load_classifier(name='resnet101', n=2)  # initialize
        modelc.load_state_dict(torch.load('weights/resnet101.pt', map_location=device)['model'])  # load weights
        modelc.to(device).eval()  # load_state_dict() does not return the module, so move/eval separately

    # Set Dataloader
    vid_path, vid_writer = None, None
    if webcam:
        view_img = check_imshow()
        cudnn.benchmark = True  # set True to speed up constant image size inference
        dataset = LoadStreams(source, img_size=imgsz, stride=stride)
    else:
        dataset = LoadImages(source, img_size=imgsz, stride=stride)

    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in names]

    # Run inference
    if device.type != 'cpu':
        model(torch.zeros(1, 3, imgsz, imgsz).to(device).type_as(next(model.parameters())))  # run once
    old_img_w = old_img_h = imgsz
    old_img_b = 1

    t0 = time.time()
    for path, img, im0s, vid_cap in dataset:
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()  # uint8 to fp16/32
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # Warmup
        if device.type != 'cpu' and (old_img_b != img.shape[0] or old_img_h != img.shape[2] or old_img_w != img.shape[3]):
            old_img_b = img.shape[0]
            old_img_h = img.shape[2]
            old_img_w = img.shape[3]
            for i in range(3):
                model(img, augment=opt.augment)[0]

        # Inference
        t1 = time_synchronized()
        with torch.no_grad():   # Calculating gradients would cause a GPU memory leak
            pred = model(img, augment=opt.augment)[0]
        t2 = time_synchronized()

        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
        t3 = time_synchronized()

        # Apply Classifier
        if classify:
            pred = apply_classifier(pred, modelc, img, im0s)

        # Process detections
        for i, det in enumerate(pred):  # detections per image
            if webcam:  # batch_size >= 1
                p, s, im0, frame = path[i], '%g: ' % i, im0s[i].copy(), dataset.count
            else:
                p, s, im0, frame = path, '', im0s, getattr(dataset, 'frame', 0)

            p = Path(p)  # to Path
            save_path = str(save_dir / p.name)  # img.jpg
            txt_path = str(save_dir / 'labels' / p.stem) + ('' if dataset.mode == 'image' else f'_{frame}')  # img.txt
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
#                 print(f"BOXES ---->>>> {det[:, :4]}")
                bbox = det[:, :4]
                bbox = bbox.cpu().numpy()  # move to CPU before .numpy() (required when running on GPU)

               # Print results
#                 for c in det[:, -1].unique():
#                     n = (det[:, -1] == c).sum()  # detections per class
#                     s += f"{n} {names[int(c)]}{'s' * (n > 1)}, "  # add to string

#                 # Write results
#                 for *xyxy, conf, cls in reversed(det):
#                     if save_txt:  # Write to file
#                         xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
#                         line = (cls, *xywh, conf) if opt.save_conf else (cls, *xywh)  # label format
#                         with open(txt_path + '.txt', 'a') as f:
#                             f.write(('%g ' * len(line)).rstrip() % line + '\n')

#                     if save_img or view_img:  # Add bbox to image
#                         label = f'{names[int(cls)]} {conf:.2f}'
#                         plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=1)

#             # Print time (inference + NMS)
#             print(f'{s}Done. ({(1E3 * (t2 - t1)):.1f}ms) Inference, ({(1E3 * (t3 - t2)):.1f}ms) NMS')

#             # Stream results
#             if view_img:
#                 cv2.imshow(str(p), im0)
#                 cv2.waitKey(1)  # 1 millisecond

#             # Save results (image with detections)
#             if save_img:
#                 if dataset.mode == 'image':
#                     Image.fromarray(im0).resize((300,250)).show()
# #                     cv2.imwrite(save_path, im0)
# #                     print(f" The image with the result is saved in: {save_path}")
#                     print()
#                 else:  # 'video' or 'stream'
#                     if vid_path != save_path:  # new video
#                         vid_path = save_path
#                         if isinstance(vid_writer, cv2.VideoWriter):
#                             vid_writer.release()  # release previous video writer
#                         if vid_cap:  # video
#                             fps = vid_cap.get(cv2.CAP_PROP_FPS)
#                             w = int(vid_cap.get(cv2.CAP_PROP_FRAME_WIDTH))
#                             h = int(vid_cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
#                         else:  # stream
#                             fps, w, h = 30, im0.shape[1], im0.shape[0]
#                             save_path += '.mp4'
#                         vid_writer = cv2.VideoWriter(save_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
#                     vid_writer.write(im0)

#     if save_txt or save_img:
#         s = f"\n{len(list(save_dir.glob('labels/*.txt')))} labels saved to {save_dir / 'labels'}" if save_txt else ''
#         #print(f"Results saved to {save_dir}{s}")

#     print(f'Done. ({time.time() - t0:.3f}s)')
    return bbox

class options:
    def __init__(self, weights, source, img_size=640, conf_thres=0.1, iou_thres=0.45, device='',
                 view_img=False, save_txt=False, save_conf=False, nosave=False, classes=None,
                 agnostic_nms=False, augment=False, update=False, project='runs/detect', name='exp',
                 exist_ok=False, no_trace=False):
        self.weights = weights
        self.source = source
        self.img_size = img_size
        self.conf_thres = conf_thres
        self.iou_thres = iou_thres
        self.device = device
        self.view_img = view_img
        self.save_txt = save_txt
        self.save_conf = save_conf
        self.nosave = nosave
        self.classes = classes
        self.agnostic_nms = agnostic_nms
        self.augment = augment
        self.update = update
        self.project = project
        self.name = name
        self.exist_ok = exist_ok
        self.no_trace = no_trace

if __name__ == '__main__':
    path = '<path to pretrained model>'  # placeholder: directory containing best.pt
    source = '<path to image dir to be inferenced>'  # placeholder: image directory
    opt = options(weights=f'{path}/best.pt', source=source)
    print(opt)
    #check_requirements(exclude=('pycocotools', 'thop'))
    bbox = None
    with torch.no_grad():
        if opt.update:  # update all models (to fix SourceChangeWarning)
            for opt.weights in ['yolov7.pt']:
                bbox = detect()
                strip_optimizer(opt.weights)
        else:
            bbox = detect()

I edited the code from detect.py to get the bbox of an image in the source directory. You will have to set the path to the model weights and the source image directory according to their respective locations.

You also might have to change the bbox variable in the detect() function if there is more than one image in the directory: as written, it only keeps the boxes of the last image processed. With multiple images you could change it to a key-value pair, using the filename as the key and det[:, :4] as the bbox, as shown in the sketch below. This should give you a general idea of how to get the bbox coordinates.
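
A minimal sketch of that idea (a hypothetical variant, not part of the script above), keyed on p.name, which is already available inside the detection loop:

# In detect(), initialise a dict before the dataset loop instead of `bbox = None`:
bboxes = {}  # filename -> (N, 4) array of xyxy boxes

# Inside the `if len(det):` block, store the boxes for the current image:
bboxes[p.name] = det[:, :4].cpu().numpy()

# At the end of detect(), return the dict instead of the last bbox:
return bboxes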

Hope this helps!!

Owen-Owen commented 1 year ago

Thank you so much, sir! But I want both the bbox info and the kpts info written to one txt file. I made it myself, but I ran into a new problem: the bbox info is normalized but the kpts info is raw. How can I get normalized kpts info? Thanks again.

rd-neosoft commented 1 year ago

From the pose branch, use detect.py. Divide the x, y keypoint coordinates by the image width and height: coordinate_x / width and coordinate_y / height. Make sure you are using coordinates that have been re-scaled from img_size to im0 size, and then divide by the original image shape. Once you have the normalised coordinates, just save them in whatever file format you want.

Owen-Owen commented 1 year ago

Yeah, but how can I do that?

rd-neosoft commented 1 year ago

Using this code you get all 17 keypoints in the following format: [[0.345, 0.543, 0.6], [0.124, 0.653, 0.8], ..., [0.345, 0.543, 0.6]], where each inner list is [norm_coord_x, norm_coord_y, conf]. In the pose branch detect.py, add the code below after the line kpts = det[det_index, 6:]:

new_kpts = kpts.cpu().detach().numpy().tolist()  # flat list: [x1, y1, conf1, x2, y2, conf2, ...]
norm_kpts = [
    [
        new_kpts[idx] / im0.shape[1],      # x normalized by original image width
        new_kpts[idx + 1] / im0.shape[0],  # y normalized by original image height
        round(new_kpts[idx + 2], 3)        # keypoint confidence, rounded to 3 decimals
    ]
    for idx in range(0, len(new_kpts), 3)  # one (x, y, conf) triplet per keypoint
]
output_path = "#"  # placeholder: set this to your output file path
with open(output_path, "w") as output:
    output.write(str(norm_kpts))

Note: this is sample code and it saves the keypoints to a separate file; you can modify it according to your requirements.
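
If you want the bbox and the keypoints on the same line of one txt file, here is a minimal sketch combining the two answers above; it assumes it sits inside the per-detection loop of the pose branch detect.py, where xyxy, det_index, gn, im0 and txt_path are already defined:

# One line per detection: "x_c y_c w h kx1 ky1 kc1 kx2 ky2 kc2 ...",
# with the bbox and the keypoint x/y coordinates all normalized to [0, 1].
xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized bbox

kpts = det[det_index, 6:].cpu().detach().numpy().tolist()
norm_kpts = []
for idx in range(0, len(kpts), 3):
    norm_kpts += [kpts[idx] / im0.shape[1],      # x / original image width
                  kpts[idx + 1] / im0.shape[0],  # y / original image height
                  kpts[idx + 2]]                 # keypoint confidence, unchanged

with open(txt_path + '.txt', 'a') as f:
    line = xywh + norm_kpts
    f.write(('%g ' * len(line)).rstrip() % tuple(line) + '\n')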

Owen-Owen commented 1 year ago

@rd-neosoft Hello, thank you so much, sir. May I ask what the third number means? If it means 0 = unmarked, 1 = marked but invisible, 2 = marked and visible, then how can I get that unchanged number?

zhangYQHBAU commented 4 months ago

The precision and recall are low in the training stage for yolov7-pose, and I cannot figure out the reason.