Hi, I found a way to avoid using tsv files by modifying `scene_graph_benchmark/tools/demo/demo_image.py`; now I only need a `jpg` image dataset, the VinVL yaml configuration file and the model weight file. The predictions are collected in a dictionary and stored in `pth` format. I ran it on Google Colab and it generates predictions at a rate of about 2 s/image. I hope this helps.

Hi, I found a way to avoid using tsv files by modifying scene_graph_benchmark/tools/demo/demo_image.py; now I only need a jpg image dataset, the VinVL yaml configuration file and the model weight file. The predictions are collected in a dictionary and stored in pth format. I ran it on Google Colab and it generates predictions at a rate of about 2 s/image. I hope this helps.

# pretrained models at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/vinvl_vg_x152c4.pth
# the associated labelmap at https://penzhanwu2.blob.core.windows.net/sgg/sgg_benchmark/vinvl_model_zoo/VG-SGG-dicts-vgoi6-clipped.json

import argparse
import json
import os
import os.path as op

import cv2
import torch
from PIL import Image

from scene_graph_benchmark.scene_parser import SceneParser
from scene_graph_benchmark.AttrRCNN import AttrRCNN
from maskrcnn_benchmark.data.transforms import build_transforms
from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
from maskrcnn_benchmark.config import cfg
from scene_graph_benchmark.config import sg_cfg
from maskrcnn_benchmark.data.datasets.utils.load_files import \
    config_dataset_file
from maskrcnn_benchmark.data.datasets.utils.load_files import load_labelmap_file
from maskrcnn_benchmark.utils.miscellaneous import mkdir

def cv2Img_to_Image(input_img):
    """Convert an OpenCV BGR image array into an RGB PIL image.

    The conversion works on a copy of ``input_img``, so the caller's array
    is never handed to OpenCV directly.
    """
    bgr_pixels = input_img.copy()
    rgb_pixels = cv2.cvtColor(bgr_pixels, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_pixels)

def detect_objects_on_single_image(model, transforms, cv2_img):
    """Run the detector on one OpenCV image and return its prediction.

    Args:
        model: Detection model. It is called on the transformed image and
            must expose a ``device`` attribute.
        transforms: Callable invoked as ``transforms(img, target=None)``
            that returns an ``(image, target)`` pair.
        cv2_img: The original image as loaded by OpenCV. Its ``shape[0]``
            and ``shape[1]`` are used as height and width to scale the
            output boxes back to the original resolution.

    Returns:
        The first prediction returned by the model, moved to the CPU and
        resized to the ``(width, height)`` of ``cv2_img``.

    Raises:
        ValueError: If ``cv2_img`` is ``None``, which is what
            ``cv2.imread`` returns for a missing or unreadable file.
    """
    # Fail early with a clear message instead of an AttributeError raised
    # from inside the colour conversion helper.
    if cv2_img is None:
        raise ValueError(
            "cv2_img is None; the image could not be read "
            "(missing file or unsupported format)")

    img_input = cv2Img_to_Image(cv2_img)
    img_input, _ = transforms(img_input, target=None)
    img_input = img_input.to(model.device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        prediction = model(img_input)[0].to('cpu')

    img_height, img_width = cv2_img.shape[:2]
    return prediction.resize((img_width, img_height))

# Build the configuration: merge the scene-graph config (sg_cfg) into cfg with
# new keys temporarily allowed, then load the VinVL yaml on top of it.
cfg.set_new_allowed(True)
cfg.merge_from_other_cfg(sg_cfg)
cfg.set_new_allowed(False)
cfg.merge_from_file('/scene_graph_benchmark/sgg_configs/vgattr/vinvl_x152c4.yaml')

# Extra overrides, written as (key, value) pairs and flattened below into the
# ordered [key, value, key, value, ...] list that merge_from_list receives.
# MODEL.WEIGHT is the path of the VinVL weight pth file.
_overrides = (
    ('MODEL.WEIGHT', 'vinvl_vg_x152c4.pth'),
    ('MODEL.ROI_HEADS.NMS_FILTER', 1),
    ('MODEL.ROI_HEADS.SCORE_THRESH', 0.2),
    ('TEST.IGNORE_BOX_REGRESSION', False),
    ('MODEL.ATTRIBUTE_ON', True),
)
argument_list = [item for pair in _overrides for item in pair]
cfg.merge_from_list(argument_list)
cfg.freeze()

# NOTE: the single-image existence check inherited from the demo script
# (`assert op.isfile(args.img_file)`) is disabled here; images are read from a
# directory further down instead of from a command-line argument.

# Directory passed to the checkpointer as save_dir. It is not created here:
# the `mkdir(output_dir)` call is disabled.
output_dir = cfg.OUTPUT_DIR

# Build the AttrRCNN model from cfg, move it to the configured device and
# switch it to evaluation mode.
model = AttrRCNN(cfg)
model.to(cfg.MODEL.DEVICE)
model.eval()

# Load the weights named by MODEL.WEIGHT into the model.
checkpointer = DetectronCheckpointer(cfg, model, save_dir=output_dir)
checkpointer.load(cfg.MODEL.WEIGHT)

# Preprocessing pipeline built with is_train=False; it is applied to every
# image before the model is called.
transforms = build_transforms(cfg, is_train=False)

input_img_directory = 'insert your images directory path here'
# Must be a .pth path: the results are written with torch.save.
output_prediction_file = 'insert your output pth file path here'

# Maps each image's file name (without extension) to a dict of its
# prediction fields.
dets = {}
for img_name in os.listdir(input_img_directory):
    # splitext copes with names that have no extension or contain several
    # dots; indexing the result of split('.') raised IndexError on the former
    # and truncated the stem of the latter.
    stem, ext = os.path.splitext(img_name)
    img_file_path = os.path.join(input_img_directory, img_name)

    # Convert png format to jpg format; the jpg is written next to the
    # original and is the file that gets fed to the detector.
    if ext.lower() == '.png':
        new_name = stem + '.jpg'
        with Image.open(img_file_path) as im:
            img_file_path = os.path.join(input_img_directory, new_name)
            im.convert('RGB').save(img_file_path)
        print(new_name)

    print(img_file_path)

    cv2_img = cv2.imread(img_file_path)
    if cv2_img is None:
        # cv2.imread signals failure by returning None (sub-directories,
        # non-image files, corrupt images); skip these instead of crashing.
        print('Skipping unreadable image: {}'.format(img_file_path))
        continue

    det = detect_objects_on_single_image(model, transforms, cv2_img)

    # Per the original author's note, the prediction carries the fields
    # 'labels', 'scores', 'box_features', 'scores_all', 'boxes_all',
    # 'attr_labels' and 'attr_scores'; box_features are the ones used for
    # Oscar.
    # `det` is already the single per-image result, so its fields are read
    # directly (the previous code referenced an undefined `det1[0]`).
    det_dict = {key: det.get_field(key) for key in det.fields()}

    dets[stem] = det_dict

torch.save(dets, output_prediction_file)

_Originally posted by @SPQRXVIII001 in https://github.com/microsoft/scene_graph_benchmark/issues/7#issuecomment-904324682_

microsoft / scene_graph_benchmark