seanzhuh / SeqTR

SeqTR: A Simple yet Universal Network for Visual Grounding
https://arxiv.org/abs/2203.16265

inference api #2

Closed fcakyon closed 1 year ago

fcakyon commented 2 years ago

Thanks for sharing this great work!

How can I perform inference on a single image with a single query? I only see scripts for batch testing and training.

Best

seanzhuh commented 2 years ago

Hi, thanks for your interest.

A naive way would be to set batch_size to 1. If you want an inference script for demos, you can first build the dataset, dataloader, and model as in tools/train.py or test.py, then use the extract_data function in seqtr/datasets/utils.py to extract the data in dictionary format. After this, you can freely forward the data, get the model's predictions, and then draw the bboxes or masks on your own.

Actually I have such a script, and I will add it in the next update. :)

seanzhuh commented 2 years ago
import argparse
import os.path as osp
from typing import Sequence
from mmcv import Config, DictAction
from mmcv.utils import mkdir_or_exist
from seqtr.apis import inference_model

def parse_args():
    parser = argparse.ArgumentParser(description="macvg-inference")
    parser.add_argument('config', help='inference config file path.')
    parser.add_argument(
        'checkpoint', help='the checkpoint file to load from.')
    parser.add_argument(
        '--output-dir', help='directory where inference results will be saved.')
    parser.add_argument('--with-gt', action='store_true',
                        help='draw ground-truth bbox/mask on image if true.')
    parser.add_argument('--no-overlay', action='store_false', dest='overlay',
                        help='save raw masks instead of overlaying them on the image.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--which-set',
        type=str,
        nargs='+',
        default='val',
        help="evaluation which_sets, which depends on the dataset, e.g., \
        'val', 'testA', 'testB' for RefCOCO(Plus)UNC, and 'val', 'test' for RefCOCOgUMD.")
    args = parser.parse_args()
    return args

if __name__ == "__main__":
    args = parse_args()
    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    cfg.checkpoint = args.checkpoint
    assert args.which_set is not None, "please specify at least one which_set to inference on."
    if isinstance(args.which_set, str):
        cfg.which_set = [args.which_set]
    elif isinstance(args.which_set, Sequence):
        cfg.which_set = args.which_set
    cfg.overlay = args.overlay
    cfg.output_dir = args.output_dir
    cfg.with_gt = args.with_gt
    cfg.rank = 0
    cfg.distributed = False

    for which_set in cfg.which_set:
        mkdir_or_exist(
            osp.join(args.output_dir, cfg.dataset + "_" + which_set))

    inference_model(cfg)
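
The script above is the command-line entry point; it would presumably be invoked along the lines of `python inference.py <config> <checkpoint> --output-dir results --which-set val` (exact paths depend on your setup). The code below is the inference_model implementation it imports from seqtr.apis.
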
import mmcv
import torch
import os.path as osp

from seqtr.utils import load_checkpoint, get_root_logger
from seqtr.core import imshow_expr_bbox, imshow_expr_mask
from seqtr.models import build_model, ExponentialMovingAverage
from seqtr.datasets import extract_data, build_dataset, build_dataloader

try:
    import apex
except ImportError:  # apex is optional; it is only needed when cfg.use_fp16 is enabled
    apex = None

def inference_model(cfg):
    datasets_cfg = [cfg.data.train]
    for which_set in cfg.which_set:
        datasets_cfg.append(getattr(cfg.data, which_set))  # e.g. cfg.data.val or cfg.data.testA

    datasets = list(map(build_dataset, datasets_cfg))
    dataloaders = list(
        map(lambda dataset: build_dataloader(cfg, dataset), datasets))

    model = build_model(cfg.model,
                        word_emb=datasets[0].word_emb,
                        num_token=datasets[0].num_token)
    model = model.cuda()
    if cfg.use_fp16:
        model = apex.amp.initialize(
            model, opt_level="O1")
        for m in model.modules():
            if hasattr(m, "fp16_enabled"):
                m.fp16_enabled = True
    if cfg.ema:
        model_ema = ExponentialMovingAverage(
            model, cfg.ema_factor)
    else:
        model_ema = None
    load_checkpoint(model, model_ema, None, cfg.checkpoint)
    if cfg.ema:
        model_ema.apply_shadow()

    model.eval()
    logger = get_root_logger()
    with_bbox, with_mask = False, False
    for i, which_set in enumerate(cfg.which_set):
        logger.info(f"inferencing on split {which_set}")
        prog_bar = mmcv.ProgressBar(len(datasets[i+1]))
        with torch.no_grad():
            for batch, inputs in enumerate(dataloaders[i+1]):
                gt_bbox, gt_mask, is_crowd = None, None, None
                if 'gt_bbox' in inputs:
                    with_bbox = True
                    gt_bbox = inputs.pop('gt_bbox').data[0]
                if 'gt_mask_rle' in inputs:
                    with_mask = True
                    gt_mask = inputs.pop('gt_mask_rle').data[0]
                if 'is_crowd' in inputs:
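                    # popped and discarded: is_crowd is not needed here and must not reach model(**inputs)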
                    inputs.pop('is_crowd').data[0]

                if not cfg.distributed:
                    inputs = extract_data(inputs)

                img_metas = inputs['img_metas']
                batch_size = len(img_metas)

                predictions = model(**inputs,
                                    return_loss=False,
                                    rescale=True,
                                    with_bbox=with_bbox,
                                    with_mask=with_mask)

                pred_bboxes = [None for _ in range(batch_size)]
                if with_bbox:
                    pred_bboxes = predictions.pop('pred_bboxes')
                pred_masks = [None for _ in range(batch_size)]
                if with_mask:
                    pred_masks = predictions.pop('pred_masks')

                for j, (img_meta, pred_bbox, pred_mask) in enumerate(zip(img_metas, pred_bboxes, pred_masks)):
                    filename, expression = img_meta['filename'], img_meta['expression']
                    bbox_gt, mask_gt = None, None
                    if cfg.with_gt and with_bbox:
                        bbox_gt = gt_bbox[j]
                    if cfg.with_gt and with_mask:
                        mask_gt = gt_mask[j]

                    outfile = osp.join(
                        cfg.output_dir,
                        cfg.dataset + "_" + which_set,
                        expression.replace(" ", "_") + "_" + osp.basename(filename))

                    if with_bbox:
                        imshow_expr_bbox(filename,
                                         pred_bbox,
                                         outfile,
                                         gt_bbox=bbox_gt)
                    if with_mask:
                        imshow_expr_mask(filename,
                                         pred_mask,
                                         outfile,
                                         gt_mask=mask_gt,
                                         overlay=cfg.overlay)

                    prog_bar.update()
    if cfg.ema:
        model_ema.restore()

Hi, you can use the above code for running inference on a single image. Note, however, that this is the original code, so it may be buggy.

zzh-tech commented 2 years ago

Sorry, but the above code seems to operate on datasets? I have no idea how to use it for inference with a single image and a description as input.

fcakyon commented 2 years ago

@sean-zhuh yes, we need code that performs inference on a single tensor, not a dataset.

seanzhuh commented 2 years ago
EPS = 1e-2

def color_val_matplotlib(color):
    """Convert various input in BGR order to normalized RGB matplotlib color
    tuples,

    Args:
        color (:obj:`Color`/str/tuple/int/ndarray): Color inputs

    Returns:
        tuple[float]: A tuple of 3 normalized floats indicating RGB channels.
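
    Example:
        color_val_matplotlib('red') -> (1.0, 0.0, 0.0)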
    """
    color = mmcv.color_val(color)
    color = [c / 255 for c in color[::-1]]
    return tuple(color)

def imshow_expr_bbox(filename,
                     pred_bbox,
                     outfile,
                     gt_bbox=None,
                     pred_bbox_color='red',
                     gt_bbox_color='blue',
                     thickness=3):
    plt.clf()
    _, axe = plt.subplots()

    pred_bbox_color = color_val_matplotlib(pred_bbox_color)
    gt_bbox_color = color_val_matplotlib(gt_bbox_color)

    img = mmcv.imread(filename).astype(numpy.uint8)
    img = numpy.ascontiguousarray(img)

    pred_bbox_int = pred_bbox.long().cpu()
    pred_bbox_poly = [[pred_bbox_int[0], pred_bbox_int[1]], [pred_bbox_int[2], pred_bbox_int[1]],
                      [pred_bbox_int[2], pred_bbox_int[3]], [pred_bbox_int[0], pred_bbox_int[3]]]
    pred_bbox_poly = numpy.array(pred_bbox_poly).reshape((4, 2))
    pred_polygon = Polygon(pred_bbox_poly)
    pred_patch = PatchCollection([pred_polygon], facecolor='none', edgecolors=[
                                 pred_bbox_color], linewidths=thickness)

    axe.add_collection(pred_patch)

    if gt_bbox is not None:
        gt_bbox_int = gt_bbox.long().cpu()
        gt_bbox_poly = [[gt_bbox_int[0], gt_bbox_int[1]], [gt_bbox_int[0], gt_bbox_int[3]],
                        [gt_bbox_int[2], gt_bbox_int[3]], [gt_bbox_int[2], gt_bbox_int[1]]]
        gt_bbox_poly = numpy.array(gt_bbox_poly).reshape((4, 2))
        gt_polygon = Polygon(gt_bbox_poly)
        gt_patch = PatchCollection(
            [gt_polygon], facecolor='none', edgecolors=[gt_bbox_color], linewidths=thickness)
        axe.add_collection(gt_patch)

    axe.axis('off')
    axe.imshow(img)
    plt.savefig(outfile)

    plt.close()

def imshow_expr_mask(filename,
                     pred_mask,
                     outfile,
                     gt_mask=None,
                     overlay=True):
    if not overlay:
        plt.clf()
        plt.axis('off')
        pred_mask = maskUtils.decode(pred_mask).astype(bool)
        plt.imshow(pred_mask, "gray")
        plt.savefig(outfile.replace(".jpg", "_pred.jpg"))
        if gt_mask is not None:
            plt.clf()
            plt.axis('off')
            gt_mask = maskUtils.decode(gt_mask).astype(bool)
            assert gt_mask.shape == pred_mask.shape
            plt.imshow(gt_mask, "gray")
            plt.savefig(outfile.replace(".jpg", "_gt.jpg"))
        plt.close()
    else:
        img = cv2.imread(filename)[:, :, ::-1]
        height, width = img.shape[:2]
        img = numpy.ascontiguousarray(img).clip(0, 255).astype(numpy.uint8)
        output_pred = VisImage(img, scale=1.)
        pred_mask = maskUtils.decode(pred_mask)
        assert pred_mask.shape[0] == height and pred_mask.shape[1] == width
        pred_mask = GenericMask(pred_mask, height, width)
        for segment in pred_mask.polygons:
            polygon = mpl.patches.Polygon(
                segment.reshape(-1, 2),
                fill=True,
                facecolor=mplc.to_rgb([0.439, 0.188, 0.627]) + (0.65, ),
                edgecolor=mplc.to_rgb([0., 0., 0.]) + (1, ),
                linewidth=2
            )
            output_pred.ax.add_patch(polygon)
        cv2.imwrite(outfile.replace(".jpg", "_pred.jpg"),
                    output_pred.get_image()[:, :, ::-1])
        if gt_mask is not None:
            output_gt = VisImage(img, scale=1.)
            gt_mask = maskUtils.decode(gt_mask)
            assert gt_mask.shape[0] == height and gt_mask.shape[1] == width
            gt_mask = GenericMask(gt_mask, height, width)
            for segment in gt_mask.polygons:
                polygon = mpl.patches.Polygon(
                    segment.reshape(-1, 2),
                    fill=True,
                    facecolor=mplc.to_rgb([0.439, 0.188, 0.627]) + (0.65, ),
                    edgecolor=mplc.to_rgb([0., 0., 0.]) + (1, ),
                    linewidth=2
                )
                output_gt.ax.add_patch(polygon)
            cv2.imwrite(outfile.replace(".jpg", "_gt.jpg"),
                        output_gt.get_image()[:, :, ::-1])

Some functions are missing here. You can just replace the dataset with your own list of input images; I guess there is nothing particularly difficult here, the logic is the same as in the code above.
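
For readers looking for the single-image path, below is a rough sketch that pieces together the calls from the two files above into a minimal demo. The config path, checkpoint path, and output filename are hypothetical, and to truly get one image per forward pass you would set the loader's batch size to 1 in the config; treat this as a starting point, not a verified script.

import torch
from mmcv import Config
from seqtr.models import build_model
from seqtr.utils import load_checkpoint
from seqtr.core import imshow_expr_bbox
from seqtr.datasets import extract_data, build_dataset, build_dataloader

cfg = Config.fromfile('configs/seqtr_det.py')  # hypothetical config path
cfg.rank, cfg.distributed = 0, False           # as set by the entry script above

# inference_model takes word embeddings from the training split, so mirror that here
train_set = build_dataset(cfg.data.train)
val_set = build_dataset(cfg.data.val)
loader = build_dataloader(cfg, val_set)

model = build_model(cfg.model,
                    word_emb=train_set.word_emb,
                    num_token=train_set.num_token).cuda()
load_checkpoint(model, None, None, 'work_dir/seqtr_det.pth')  # hypothetical checkpoint
model.eval()

with torch.no_grad():
    batch = next(iter(loader))  # one batch; set the loader's batch size to 1 in the config
    for key in ('gt_bbox', 'gt_mask_rle', 'is_crowd'):  # strip annotations, as above
        if key in batch:
            batch.pop(key)
    inputs = extract_data(batch)
    predictions = model(**inputs, return_loss=False, rescale=True,
                        with_bbox=True, with_mask=False)
    img_meta = inputs['img_metas'][0]
    imshow_expr_bbox(img_meta['filename'],
                     predictions['pred_bboxes'][0],
                     'demo_pred.jpg')  # writes the image with the predicted box drawn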

zzh-tech commented 2 years ago

Hi, thanks for sharing the code! I'll try it. But I think providing a ready-made inference file would make this amazing project more popular and easier to use :-) For example: python inference.py --checkpoint_path $checkpoint_path --image_path $image_path --description_path $description_path

seanzhuh commented 2 years ago

Yeah, but I'm preparing for a job interview right now; maybe in two months I'll write the script. Please stay tuned, I'll try my best to make this repository as easy to use as possible.

CCYChongyanChen commented 1 year ago

@sean-zhuh Thanks for sharing! Do you mind sharing the Polygon function? It is missing. Is it from a library, or a function you wrote? Thank you so much!

seanzhuh commented 1 year ago

Hi, you can import the functions as below:

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
import pycocotools.mask as maskUtils
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from detectron2.utils.visualizer import GenericMask, VisImage

CCYChongyanChen commented 1 year ago

Thank you so much! One more question: why does drawing the bbox for the ground truth use a different corner order from drawing it for the prediction?

CCYChongyanChen commented 1 year ago

I just realized that these two are equal. No need to reply

seanzhuh commented 1 year ago

Aha, I didn't notice this. Does it make a difference?

CCYChongyanChen commented 1 year ago

No. I attached code below that verifies it makes no difference.

However, I do have some other questions: (1) It seems that the bbox is scaled in the visualization code. Shouldn't we also scale the image when plotting?
(2) I would like to confirm whether the bbox format is (x offset from the top-left corner, y offset from the top-left corner, w, h) or not. Thank you so much in advance!

from matplotlib.patches import Polygon
from matplotlib import pyplot as plt
import numpy
from matplotlib.collections import PatchCollection

plt.clf()
img = numpy.zeros((2009, 2009))
_, axe = plt.subplots()
gt_bbox_int = [100, 500, 500, 700]  # (x-away from tl, y-away from tl, w, h)

# Corner order used for the ground truth in imshow_expr_bbox:
# tl, bl, br, tr -> [[100 500] [100 700] [500 700] [500 500]]
# gt_bbox_poly = [[gt_bbox_int[0], gt_bbox_int[1]], [gt_bbox_int[0], gt_bbox_int[3]],
#                 [gt_bbox_int[2], gt_bbox_int[3]], [gt_bbox_int[2], gt_bbox_int[1]]]

# Corner order used for the prediction in imshow_expr_bbox:
# tl, tr, br, bl -> [[100 500] [500 500] [500 700] [100 700]]
gt_bbox_poly = [[gt_bbox_int[0], gt_bbox_int[1]], [gt_bbox_int[2], gt_bbox_int[1]],
                [gt_bbox_int[2], gt_bbox_int[3]], [gt_bbox_int[0], gt_bbox_int[3]]]

gt_bbox_poly = numpy.array(gt_bbox_poly).reshape((4, 2))
print(gt_bbox_poly)
gt_polygon = Polygon(gt_bbox_poly)
gt_patch = PatchCollection(
    [gt_polygon], facecolor='none', edgecolors=['blue'], linewidths=3)
axe.add_collection(gt_patch)

axe.axis('off')
axe.imshow(img)
plt.show()

# Either order traces the same rectangle, so the two conventions are equivalent.

seanzhuh commented 1 year ago

  1. Yes
  2. No, the bbox format is [x_tl, y_tl, x_br, y_br], where tl is top-left and br is bottom-right.
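
To make the two conventions concrete, here is a small helper (not part of the repo) for converting between COCO-style [x_tl, y_tl, w, h] and the corner format described above:

def xywh_to_xyxy(box):
    """COCO-style [x_tl, y_tl, w, h] -> corner format [x_tl, y_tl, x_br, y_br]."""
    x, y, w, h = box
    return [x, y, x + w, y + h]

def xyxy_to_xywh(box):
    """Corner format [x_tl, y_tl, x_br, y_br] -> COCO-style [x_tl, y_tl, w, h]."""
    x1, y1, x2, y2 = box
    return [x1, y1, x2 - x1, y2 - y1]

# e.g. xywh_to_xyxy([100, 500, 400, 200]) == [100, 500, 500, 700]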

CCYChongyanChen commented 1 year ago

Thanks!

  1. It seems that in the inference API, the image and the prediction are at the original size, but the ground truth is scaled.
  2. Is the bbox in the JSON file in [x_tl, y_tl, w, h] format, while the bbox in the inference code is in [x_tl, y_tl, x_br, y_br] format?

Adding these lines fixed my issue:

scale = img_meta["scale_factor"]
gt_bbox /= gt_bbox.new_tensor(scale)
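
For context, this fix presumably belongs where bbox_gt is selected in the visualization loop of inference_model above (a sketch using the variable names from that loop, not verified against the repo):

if cfg.with_gt and with_bbox:
    bbox_gt = gt_bbox[j]
    # ground-truth boxes are in resized-image coordinates, while predictions are
    # rescaled to the original image (rescale=True), so map the gt box back too
    scale = img_meta["scale_factor"]
    bbox_gt = bbox_gt / bbox_gt.new_tensor(scale)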