HEasoner opened this issue 11 months ago
Hey @HEasoner, thanks for your interest!
For the evaluation, we first load the label space for each image. You may check how to write a dataloader at #1. The label space contains both free-form text descriptions and plain category names. For descriptions, we query GLIP with each description individually to get predictions. For category names, we concatenate k categories (20 by default) into a single text query for GLIP.
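As a rough illustration of the second case (this is not the actual GLIP create_queries_and_maps implementation, and the helper name concat_categories is made up for this sketch), k category names can be concatenated into one caption while keeping each name's character span, from which a label-to-token positive map can then be derived:

    # minimal sketch of concatenating category names into a single text query;
    # each label keeps the character span of its name inside the caption
    def concat_categories(category_names):
        spans = {}
        caption = ""
        for label, name in enumerate(category_names, start=1):  # labels start from 1
            start = len(caption)
            caption += name
            spans[label] = (start, len(caption))  # character span of this category
            caption += ". "                       # separator between categories
        return caption.rstrip(), spans

    caption, spans = concat_categories(["person", "dog", "frisbee"])
    # caption -> "person. dog. frisbee."
    # spans   -> {1: (0, 6), 2: (8, 11), 3: (13, 20)}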
You may find sample evaluation code below. Note that it is incomplete and cannot run directly.
predictions = []
for iidx, batch in enumerate(_iterator):
    images, targets, image_ids, *_ = batch
    images = images.to(device)

    text_queries = targets[0].get_field("inference_obj_descriptions")
    text_queries_ids = targets[0].get_field("inference_obj_description_ids")
    image_size = targets[0].size
    image_id = image_ids[0]

    des_id_start = 0
    while des_id_start < len(text_queries_ids):
        # single description each time for free-form descriptions (more than 2 words)
        if num_of_words(text_queries[des_id_start]) > 2:
            description_list = remove_full_stop([text_queries[des_id_start]])
            description_id_list = [text_queries_ids[des_id_start]]
            des_id_start += 1
        else:
            # concatenate up to chunk_size category names into one query
            description_list = remove_full_stop(text_queries[des_id_start:des_id_start + chunk_size])
            description_id_list = text_queries_ids[des_id_start:des_id_start + chunk_size]
            des_id_start += chunk_size

        # create positive map; always use continuous labels starting from 1
        continue_labels = np.arange(0, chunk_size) + class_plus
        cur_queries, positive_map_label_to_token = create_queries_and_maps(
            continue_labels, description_list, tokenizer, cfg=cfg)

        with torch.no_grad():
            output = model(images, captions=[cur_queries], positive_map=positive_map_label_to_token)
            output = output[0].to(cpu_device).convert(mode="xywh")
        output = output.resize(image_size)  # back to the original scale

        # thresholding
        if threshold is not None:
            scores = output.get_field("scores")
            output = output[scores > threshold]

        # sort by scores and keep top-k
        if topk_per_eval is not None:
            scores = output.get_field("scores")
            _, sortIndices = scores.sort(descending=True)
            output = output[sortIndices]
            output = output[:topk_per_eval]

        # map continuous ids back to description ids
        cont_ids_2_descript_ids = {i: v for i, v in enumerate(description_id_list)}
        pred_boxes = output.bbox
        pred_labels = output.get_field("labels") - class_plus  # continuous ids, starting from 0
        pred_scores = output.get_field("scores")

        # collect one prediction entry per box
        for box_idx, box in enumerate(pred_boxes):
            predictions.append({
                "image_id": image_id,
                "bbox": box.cpu().tolist(),
                "description_ids": [cont_ids_2_descript_ids[pred_labels[box_idx].item()]],
                "scores": [pred_scores[box_idx].item()],
            })
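After the loop, the accumulated predictions list holds the per-box dictionaries built above (image_id, bbox in xywh, description_ids, scores). A minimal sketch for writing them out for the OmniLabel evaluation toolkit could look like the following; the file name result.json is just a placeholder:

    import json

    # hedged sketch: dump the accumulated predictions to a JSON file that can be
    # handed to the OmniLabel evaluation toolkit; the file name is a placeholder
    with open("result.json", "w") as f:
        json.dump(predictions, f)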
@xiaofeng94 Thank you for your answer, but I still have some doubts. Could you share the dataloader used here? I can't reproduce it with a simple one. Also, what are the values of chunk_size and class_plus in this code? Maybe I need a little more information to reproduce it.
Hey @HEasoner, for the dataloader, you may check the following snippet. chunk_size = 20 by default, and class_plus is a GLIP hyperparameter that equals 1 by default.
# assumed imports: the snippet relies on the OmniLabel toolkit (omnilabeltools)
# and the BoxList structure from the GLIP / maskrcnn_benchmark code base
import os

import torch
import torch.utils.data as data
from PIL import Image

import omnilabeltools as olt
from maskrcnn_benchmark.structures.bounding_box import BoxList


def pil_loader(path, retry=5):
    # open path as file to avoid ResourceWarning (https://github.com/python-pillow/Pillow/issues/835)
    ri = 0
    while ri < retry:
        try:
            with open(path, "rb") as f:
                img = Image.open(f)
                return img.convert("RGB")
        except Exception:
            ri += 1


def load_omnilabel_json(path_json: str, path_imgs: str):
    assert isinstance(path_json, str)
    ol = olt.OmniLabel(path_json)

    dataset_dicts = []
    for img_id in ol.image_ids:
        img_sample = ol.get_image_sample(img_id)
        dataset_dicts.append({
            "image_id": img_sample["id"],
            "file_name": os.path.join(path_imgs, img_sample["file_name"]),
            "inference_obj_descriptions": [od["text"] for od in img_sample["labelspace"]],
            "inference_obj_description_ids": [od["id"] for od in img_sample["labelspace"]],
        })
    return dataset_dicts


class OmniLabelDataset(data.Dataset):
    """OmniLabel dataset for inference.

    Args:
        img_folder (string): Root directory where images are downloaded to.
        ann_file (string): Path to the json annotation file.
        transforms (callable, optional): A function/transform that takes in a PIL image
            and returns a transformed version, e.g., ``transforms.ToTensor``.
    """

    def __init__(self, img_folder, ann_file, transforms=None, **kwargs):
        self.img_folder = img_folder
        self.transforms = transforms
        self.dataset_dicts = load_omnilabel_json(ann_file, img_folder)

    def __getitem__(self, index):
        """
        Args:
            index (int): Index
        Returns:
            tuple: (image, target, image_id). target is a BoxList that carries the
            label space (object descriptions and their ids) for this image.
        """
        data_dict = self.dataset_dicts[index]
        img_id = data_dict["image_id"]
        path = data_dict["file_name"]
        img = pil_loader(path)

        # only supports test mode; no ground-truth boxes here
        target = BoxList(torch.Tensor(0, 4), img.size, mode="xywh").convert("xyxy")
        target.add_field("inference_obj_descriptions", data_dict["inference_obj_descriptions"])
        target.add_field("inference_obj_description_ids", data_dict["inference_obj_description_ids"])

        if self.transforms is not None:
            img = self.transforms(img)

        return img, target, img_id

    def __len__(self):
        return len(self.dataset_dicts)

    def __repr__(self):
        fmt_str = "Dataset " + self.__class__.__name__ + "\n"
        fmt_str += "    Number of datapoints: {}\n".format(self.__len__())
        fmt_str += "    Root Location: {}\n".format(self.img_folder)
        return fmt_str
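Not part of the original snippet, but a hedged sketch of how this dataset might be wired into the evaluation loop above; the collate function, the paths, the size-divisibility value, and my_test_transforms are assumptions, and the transforms are expected to already convert PIL images to tensors:

    from torch.utils.data import DataLoader
    from maskrcnn_benchmark.structures.image_list import to_image_list  # GLIP / maskrcnn_benchmark

    def collate_fn(batch):
        # batch is a list of (img, target, img_id) tuples; keep targets and ids as tuples
        images, targets, image_ids = zip(*batch)
        # batch image tensors into an ImageList so images.to(device) works as in the loop above
        images = to_image_list(images, size_divisible=32)  # size_divisible value is an assumption
        return images, targets, image_ids

    dataset = OmniLabelDataset(img_folder="path/to/images",
                               ann_file="path/to/omnilabel_val.json",
                               transforms=my_test_transforms)  # placeholder transform
    _iterator = DataLoader(dataset, batch_size=1, shuffle=False, collate_fn=collate_fn)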
@samschulter Hi, thanks for sharing the toolkit. If I use GLIP-L for the experiment, how do I get the result.json for evaluation?