Closed fcakyon closed 1 year ago
Hi, thanks for your interest.
A naive way would be to set batch_size to 1. If you want an inference script for a demo, you can first build the dataset, dataloader, and model as in tools/train.py or test.py, then use the function "extract_data" in seqtr/datasets/utils.py to extract the data in dictionary format. After this, you can freely forward the data, get the model predictions, and then draw the bboxes or masks on your own.
Actually, I have such a script, and I will add it in the next update. :)
import argparse
import os.path as osp
from typing import Sequence
from mmcv import Config, DictAction
from mmcv.utils import mkdir_or_exist
from seqtr.apis import inference_model
def parse_args():
    """Parse the command-line arguments of the inference script.

    Returns:
        argparse.Namespace: the parsed arguments, with attributes ``config``,
            ``checkpoint``, ``output_dir``, ``with_gt``, ``overlay``,
            ``cfg_options`` and ``which_set`` (always a list of split names).
    """
    parser = argparse.ArgumentParser(description="macvg-inference")
    parser.add_argument('config', help='inference config file path.')
    parser.add_argument(
        'checkpoint', help='the checkpoint file to load from.')
    # The output directory is joined into every result path, so a missing
    # value can only end in a TypeError later on; reject it up front instead.
    parser.add_argument(
        '--output-dir',
        required=True,
        help='directory where inference results will be saved.')
    parser.add_argument('--with-gt', action='store_true',
                        help='draw ground-truth bbox/mask on image if true.')
    parser.add_argument(
        '--no-overlay',
        action='store_false',
        dest='overlay',
        help='save the predicted mask as a plain gray image instead of '
        'overlaying it on the input image.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    # With nargs='+' argparse yields a list when the flag is given, so the
    # default is a list too; the attribute then has one type in both cases.
    parser.add_argument(
        '--which-set',
        type=str,
        nargs='+',
        default=['val'],
        help="evaluation which_sets, which depends on the dataset, e.g., "
        "'val', 'testA', 'testB' for RefCOCO(Plus)UNC, and 'val', 'test' for RefCOCOgUMD.")
    return parser.parse_args()
if __name__ == "__main__":
    # Script entry point: load the config, fold the command-line options into
    # it, create one output folder per requested split, then run inference.
    args = parse_args()

    cfg = Config.fromfile(args.config)
    overrides = args.cfg_options
    if overrides is not None:
        cfg.merge_from_dict(overrides)
    cfg.checkpoint = args.checkpoint

    requested_sets = args.which_set
    assert requested_sets is not None, "please specify at least one which_set to inference on."
    # A lone split name is wrapped so that cfg.which_set is always iterable
    # split by split.
    if isinstance(requested_sets, str):
        cfg.which_set = [requested_sets]
    elif isinstance(requested_sets, Sequence):
        cfg.which_set = requested_sets

    # Visualisation options taken from the command line.
    cfg.overlay = args.overlay
    cfg.output_dir = args.output_dir
    cfg.with_gt = args.with_gt

    # Inference runs non-distributed, as rank 0.
    cfg.rank = 0
    cfg.distributed = False

    for split_name in cfg.which_set:
        split_dir = osp.join(args.output_dir, cfg.dataset + "_" + split_name)
        mkdir_or_exist(split_dir)

    inference_model(cfg)
import mmcv
import torch
import os.path as osp
from seqtr.utils import load_checkpoint, get_root_logger
from seqtr.core import imshow_expr_bbox, imshow_expr_mask
from seqtr.models import build_model, ExponentialMovingAverage
from seqtr.datasets import extract_data, build_dataset, build_dataloader
# apex is optional: in this file it is only used when cfg.use_fp16 is set.
# Keep the import best-effort for any import-time failure, but let
# KeyboardInterrupt/SystemExit propagate and always bind the name so that a
# missing apex can be detected explicitly.
try:
    import apex
except Exception:
    apex = None
def inference_model(cfg):
    """Run the model on every split in ``cfg.which_set`` and save visualisations.

    The train split is always built as well, because the model is constructed
    from its ``word_emb`` and ``num_token`` attributes. The model is moved to
    the GPU with ``.cuda()``, the checkpoint at ``cfg.checkpoint`` is loaded,
    and for every sample an image is written below
    ``cfg.output_dir/<cfg.dataset>_<which_set>/``.

    Args:
        cfg: config object. Fields read here: ``data.train``,
            ``data.<which_set>``, ``which_set``, ``model``, ``use_fp16``,
            ``ema``, ``ema_factor``, ``checkpoint``, ``distributed``,
            ``with_gt``, ``output_dir``, ``dataset`` and ``overlay``.
    """
    # Split names originate from the command line, so look them up with
    # getattr rather than evaluating a string as code.
    datasets_cfg = [cfg.data.train]
    datasets_cfg.extend(getattr(cfg.data, which_set)
                        for which_set in cfg.which_set)

    datasets = [build_dataset(dataset_cfg) for dataset_cfg in datasets_cfg]
    dataloaders = [build_dataloader(cfg, dataset) for dataset in datasets]

    model = build_model(cfg.model,
                        word_emb=datasets[0].word_emb,
                        num_token=datasets[0].num_token)
    model = model.cuda()
    if cfg.use_fp16:
        model = apex.amp.initialize(model, opt_level="O1")
        for m in model.modules():
            if hasattr(m, "fp16_enabled"):
                m.fp16_enabled = True

    model_ema = ExponentialMovingAverage(
        model, cfg.ema_factor) if cfg.ema else None
    load_checkpoint(model, model_ema, None, cfg.checkpoint)
    if cfg.ema:
        model_ema.apply_shadow()

    model.eval()
    logger = get_root_logger()
    # Once a batch carried ground truth of a kind, the matching output stays
    # enabled for all following batches and splits.
    with_bbox, with_mask = False, False
    try:
        # Index 0 holds the train split; evaluation splits start at index 1.
        for which_set, dataset, dataloader in zip(
                cfg.which_set, datasets[1:], dataloaders[1:]):
            logger.info(f"inferencing on split {which_set}")
            prog_bar = mmcv.ProgressBar(len(dataset))
            split_dir = osp.join(cfg.output_dir, cfg.dataset + "_" + which_set)
            with torch.no_grad():
                for inputs in dataloader:
                    gt_bbox, gt_mask = None, None
                    if 'gt_bbox' in inputs:
                        with_bbox = True
                        gt_bbox = inputs.pop('gt_bbox').data[0]
                    if 'gt_mask_rle' in inputs:
                        with_mask = True
                        gt_mask = inputs.pop('gt_mask_rle').data[0]
                    # Not used for visualisation; drop it so it is not
                    # forwarded to the model as a keyword argument.
                    inputs.pop('is_crowd', None)

                    if not cfg.distributed:
                        inputs = extract_data(inputs)

                    img_metas = inputs['img_metas']

                    predictions = model(**inputs,
                                        return_loss=False,
                                        rescale=True,
                                        with_bbox=with_bbox,
                                        with_mask=with_mask)

                    if with_bbox:
                        pred_bboxes = predictions.pop('pred_bboxes')
                    else:
                        pred_bboxes = [None] * len(img_metas)
                    if with_mask:
                        pred_masks = predictions.pop('pred_masks')
                    else:
                        pred_masks = [None] * len(img_metas)

                    for j, (img_meta, pred_bbox, pred_mask) in enumerate(
                            zip(img_metas, pred_bboxes, pred_masks)):
                        filename = img_meta['filename']
                        expression = img_meta['expression']

                        # Only index the ground truth when this batch really
                        # provided it; the with_* flags may be carried over
                        # from an earlier batch.
                        # NOTE(review): predictions are requested with
                        # rescale=True while gt boxes are used as loaded; they
                        # may need dividing by img_meta['scale_factor'] to
                        # line up — confirm.
                        bbox_gt, mask_gt = None, None
                        if cfg.with_gt and gt_bbox is not None:
                            bbox_gt = gt_bbox[j]
                        if cfg.with_gt and gt_mask is not None:
                            mask_gt = gt_mask[j]

                        outfile = osp.join(
                            split_dir,
                            expression.replace(" ", "_") + "_" + osp.basename(filename))

                        if with_bbox:
                            imshow_expr_bbox(filename,
                                             pred_bbox,
                                             outfile,
                                             gt_bbox=bbox_gt)
                        if with_mask:
                            imshow_expr_mask(filename,
                                             pred_mask,
                                             outfile,
                                             gt_mask=mask_gt,
                                             overlay=cfg.overlay)

                        prog_bar.update()
    finally:
        # Put the non-EMA weights back even if inference failed midway.
        if cfg.ema:
            model_ema.restore()
Hi, you can use the above code for inference on a single image. Note, however, that this is the original code and may be buggy.
Sorry, the above code seems to be used for datasets? I have no idea how to use it to infer with a single image and description as input.
@sean-zhuh yes we need a code to perform inference over single tensor, not dataset
# Small numeric tolerance (0.01). Not referenced by any code visible in this
# snippet — NOTE(review): confirm where it is used before changing it.
EPS = 1e-2
def color_val_matplotlib(color):
    """Turn a BGR color specification into a normalized RGB tuple for matplotlib.

    Args:
        color (:obj:`Color`/str/tuple/int/ndarray): Color inputs, resolved
            through ``mmcv.color_val``.

    Returns:
        tuple[float]: Three floats (R, G, B), each channel divided by 255.
    """
    bgr = mmcv.color_val(color)
    # Reverse the channel order (BGR -> RGB) while scaling each channel.
    return tuple(channel / 255 for channel in bgr[::-1])
def _bbox_to_polygon(bbox):
    """Build a rectangular matplotlib ``Polygon`` from a 4-value box tensor."""
    # Assumes the box is [x_tl, y_tl, x_br, y_br] — TODO confirm with caller.
    x_tl, y_tl, x_br, y_br = bbox.long().cpu().tolist()[:4]
    corners = numpy.array(
        [[x_tl, y_tl], [x_br, y_tl], [x_br, y_br], [x_tl, y_br]]).reshape((4, 2))
    return Polygon(corners)


def imshow_expr_bbox(filename,
                     pred_bbox,
                     outfile,
                     gt_bbox=None,
                     pred_bbox_color='red',
                     gt_bbox_color='blue',
                     thickness=3):
    """Draw the predicted (and optionally ground-truth) box on an image and save it.

    Args:
        filename: path of the image to load with ``mmcv.imread``.
        pred_bbox: predicted box; must support ``.long().cpu()`` (e.g. a
            torch tensor) and hold four coordinates.
        outfile: path the rendered figure is saved to.
        gt_bbox: optional ground-truth box in the same form as ``pred_bbox``;
            nothing is drawn for it when ``None``.
        pred_bbox_color: color of the predicted box outline, converted with
            ``color_val_matplotlib``.
        gt_bbox_color: color of the ground-truth box outline, converted with
            ``color_val_matplotlib``.
        thickness: outline width passed as ``linewidths``.
    """
    pred_bbox_color = color_val_matplotlib(pred_bbox_color)
    gt_bbox_color = color_val_matplotlib(gt_bbox_color)

    # mmcv.imread returns BGR by default while matplotlib expects RGB, so the
    # channel order is reversed before display (imshow_expr_mask does the same
    # for its cv2-loaded image).
    img = mmcv.imread(filename).astype(numpy.uint8)
    img = numpy.ascontiguousarray(img[..., ::-1])

    # Work on a dedicated figure so that no unrelated current figure is
    # cleared, and close it even when drawing or saving fails.
    fig, axe = plt.subplots()
    try:
        axe.add_collection(PatchCollection(
            [_bbox_to_polygon(pred_bbox)],
            facecolor='none',
            edgecolors=[pred_bbox_color],
            linewidths=thickness))

        if gt_bbox is not None:
            axe.add_collection(PatchCollection(
                [_bbox_to_polygon(gt_bbox)],
                facecolor='none',
                edgecolors=[gt_bbox_color],
                linewidths=thickness))

        axe.axis('off')
        axe.imshow(img)
        fig.savefig(outfile)
    finally:
        plt.close(fig)
def _mask_outfile(outfile, tag):
    """Insert ``_<tag>`` before the file extension of ``outfile``."""
    import os.path as osp

    # splitext only touches the real extension, so non-.jpg outputs also get
    # distinct pred/gt names and ".jpg" elsewhere in the path is left alone.
    root, ext = osp.splitext(outfile)
    return f"{root}_{tag}{ext}"


def _save_binary_mask(mask, path):
    """Render a boolean mask as a gray image on the current pyplot figure."""
    plt.clf()
    plt.axis('off')
    plt.imshow(mask, "gray")
    plt.savefig(path)


def _save_mask_overlay(img, mask_rle, path):
    """Overlay the polygons of an RLE mask on ``img`` (RGB) and write it to ``path``."""
    height, width = img.shape[:2]
    canvas = VisImage(img, scale=1.)
    mask = maskUtils.decode(mask_rle)
    assert mask.shape[0] == height and mask.shape[1] == width
    for segment in GenericMask(mask, height, width).polygons:
        canvas.ax.add_patch(mpl.patches.Polygon(
            segment.reshape(-1, 2),
            fill=True,
            facecolor=mplc.to_rgb([0.439, 0.188, 0.627]) + (0.65, ),
            edgecolor=mplc.to_rgb([0., 0., 0.]) + (1, ),
            linewidth=2
        ))
    # The canvas image is RGB; cv2.imwrite expects BGR.
    cv2.imwrite(path, canvas.get_image()[:, :, ::-1])


def imshow_expr_mask(filename,
                     pred_mask,
                     outfile,
                     gt_mask=None,
                     overlay=True):
    """Save the predicted (and optionally ground-truth) mask as images.

    Two files may be written next to ``outfile``: ``<name>_pred<ext>`` and,
    when ``gt_mask`` is given, ``<name>_gt<ext>``.

    Args:
        filename: path of the source image; only read when ``overlay`` is true.
        pred_mask: predicted mask, decoded with ``maskUtils.decode``.
        outfile: base output path from which the pred/gt names are derived.
        gt_mask: optional ground-truth mask, decoded the same way as
            ``pred_mask``.
        overlay: if true, draw the mask polygons over the image; otherwise
            save the decoded masks as plain gray images.

    Raises:
        IOError: if ``overlay`` is true and the image cannot be read.
    """
    pred_path = _mask_outfile(outfile, "pred")
    gt_path = _mask_outfile(outfile, "gt")

    if not overlay:
        try:
            pred_mask = maskUtils.decode(pred_mask).astype(bool)
            _save_binary_mask(pred_mask, pred_path)
            if gt_mask is not None:
                gt_mask = maskUtils.decode(gt_mask).astype(bool)
                assert gt_mask.shape == pred_mask.shape
                _save_binary_mask(gt_mask, gt_path)
        finally:
            # Release the pyplot figure even if decoding or saving failed.
            plt.close()
        return

    bgr = cv2.imread(filename)
    if bgr is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise IOError(f"failed to read image: {filename}")
    # Convert BGR -> RGB for the matplotlib-based canvas.
    img = numpy.ascontiguousarray(bgr[:, :, ::-1]).clip(0, 255).astype(numpy.uint8)

    _save_mask_overlay(img, pred_mask, pred_path)
    if gt_mask is not None:
        _save_mask_overlay(img, gt_mask, gt_path)
Some functions are missing here. You can simply replace the dataset with your own list of input images; I guess nothing is particularly difficult here, since the logic is the same as in the code above.
Hi, thanks for sharing the code! I'll try it. But I think giving a ready-made inference file would make this amazing project more popular and easier to use :-) For example: python inference.py --checkpoint_path $checkpoint_path --image_path $image_path --description_path $image_path
Yeah, but I'm preparing for job interviews right now; maybe in two months I'll write the script. Please stay tuned, I'll try my best to make this repository as easy to use as possible.
@sean-zhuh Thanks for sharing! Do you mind sharing the Polygon function? It is missing. Is this a library or a function you wrote? Thank you so much!
Hi, you can import the function as below:
```python
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
import pycocotools.mask as maskUtils
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from detectron2.utils.visualizer import GenericMask, VisImage
```
Hi, you can import the function as below:
```python
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
import pycocotools.mask as maskUtils
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from detectron2.utils.visualizer import GenericMask, VisImage
```
Thank you so much! One more question: could I ask why the corners are listed in a different order when drawing the bbox for the gt and when drawing the bbox for the prediction?
Hi, you can import the function as below: import matplotlib as mpl import matplotlib.pyplot as plt import matplotlib.colors as mplc import matplotlib.figure as mplfigure import pycocotools.mask as maskUtils from matplotlib.patches import Polygon from matplotlib.collections import PatchCollection from detectron2.utils.visualizer import GenericMask, VisImage
Thank you so much! One more question, could I ask why drawing bbx for gt and drawing bbx for the prediction has different orders?
I just realized that these two are equal. No need to reply
aha, I didn't notice this, does it make a difference ?
aha, I didn't notice this, does it make a difference ?
No. I attached the code which verifies that it makes no difference.
However, I do have some other questions:
(1) It seems that the bbox is scaled in the visualization code. Shouldn't we also scale the image when plotting?
(2) I would like to confirm whether the bbox format is in (x-away from top left corner, y-away from top-left corner,w,h) or not.
Thank you so much in advance!
```python
from matplotlib.patches import Polygon
from matplotlib import pyplot as plt
import numpy
from matplotlib.collections import PatchCollection

plt.clf()
img = numpy.zeros((2009, 2009))
_, axe = plt.subplots()
gt_bbox_int = [100,500,500,700]  # (x-away from tl, y-away from tl,w,h)
"""
gt_bbox_poly = [[gt_bbox_int[0], gt_bbox_int[1]], [gt_bbox_int[0], gt_bbox_int[3]], [gt_bbox_int[2], gt_bbox_int[3]], [gt_bbox_int[2], gt_bbox_int[1]]]
x-away from tl, y-away from tl; x-away from tl, h; w, h; x-away from tl, y-away from tl
[[100 500] [500 500] [500 700] [100 700]]
gt_bbox_poly = [[gt_bbox_int[0], gt_bbox_int[1]], [gt_bbox_int[2], gt_bbox_int[1]], [gt_bbox_int[2], gt_bbox_int[3]], [gt_bbox_int[0], gt_bbox_int[3]]]
x-away from tl, y-away from tl; y-away from tl w, h; x-away from tl,x-away from tl, h;
[[100 500] [100 700] [500 700] [500 500]]
"""
gt_bbox_poly = numpy.array(gt_bbox_poly).reshape((4, 2))
print (gt_bbox_poly)
gt_polygon = Polygon(gt_bbox_poly)
gt_patch = PatchCollection(
    [gt_polygon], facecolor='none', edgecolors=['blue'], linewidths=3)
axe.add_collection(gt_patch)
axe.axis('off')
axe.imshow(img)
plt.show()
```
1) Yes 2) No, the bbox format is in [x_tl, y_tl, x_br, y_br], tl for top-left, br for bottom-right
- Yes
- No, the bbox format is in [x_tl, y_tl, x_br, y_br], tl for top-left, br for bottom-right
Thanks!
Adding these lines fixed my issue:

```python
scale = img_meta["scale_factor"]
gt_bbox /= gt_bbox.new_tensor(scale)
```
Thanks for sharing this great work!
How to perform single image and single query inference? I only see scripts for batch testing and training.
Bests