Open shivahanifi opened 10 months ago
The caption categories are designed and added using the following code:
# Command-line interface: one integer option selects the caption format.
parser = argparse.ArgumentParser(description='Caption format selection')
parser.add_argument(
    '--caption',
    type=int,
    choices=[1, 2, 3, 4],
    default=1,
    help='Specify a value (1, 2, 3, 4) to determine the caption type. 1 being the most detailed and 4 being least detailed.',
)
# Parsed options; args.caption holds the chosen format number.
args = parser.parse_args()
# Walk the annotated data set, read each sequence's annotation.xml, open the
# images, and build a "Pass the ..." caption whose content depends on args.caption.
# NOTE(review): this snippet was pasted without indentation, so the loop
# nesting cannot be recovered from the text alone; the comments below describe
# each statement as written and flag the places where nesting matters.

# Root directory of the annotated test data (hard-coded absolute path).
file_path = "/home/suka/code/Data/annotated_MDETR_test_data"
# Sorted names of the sub-directories located directly under file_path.
folders = sorted([f for f in os.listdir(file_path) if os.path.isdir(os.path.join(file_path, f))])
#folders = folders[2:]
for folder in folders:
folder_path = os.path.join(file_path, folder)
# Each folder is expected to contain an 'rgb_img' and a 'normMap' sub-directory.
rgb_folders_path = os.path.join(folder_path, 'rgb_img')
normMap_folders_path = os.path.join(folder_path, 'normMap')
# Sorted sub-directory names of both trees; they are paired by index below.
rgb_folders = sorted([f for f in os.listdir(rgb_folders_path) if os.path.isdir(os.path.join(rgb_folders_path, f))])
normMap_folders = sorted([f for f in os.listdir(normMap_folders_path) if os.path.isdir(os.path.join(normMap_folders_path, f))])
# NOTE(review): the bound is min(...) - 1, so the last index common to both
# lists is never visited -- confirm this is intended.
for i in range(min(len(rgb_folders),len(normMap_folders))-1):
# input images: every entry of the i-th rgb folder whose name does not contain '.xml'
images_path = os.path.join(rgb_folders_path,rgb_folders[i])
images = sorted([f for f in os.listdir(images_path) if '.xml' not in f])
# input caption: parse the annotation.xml stored next to the images
annotation_path = os.path.join(images_path, 'annotation.xml')
tree = ET.parse(annotation_path)
root = tree.getroot()
# Collect the attributes of every <object> whose <name> is not 'head'.
# obj_info is reassigned on each match rather than accumulated.
# NOTE(review): if the caption code below runs after this loop, only the last
# non-head object is captioned, and obj_info is unbound when none exists -- verify.
for obj in root.findall('object'):
if obj.find('name').text != 'head':
obj_info = {
'name': obj.find('name').text,
'color': obj.find('color').text,
'pose': obj.find('pose').text,
'placement': obj.find('placement').text,
# Bounding-box corners, converted from element text to int.
'bndbox': {
'xmin': int(obj.find('bndbox/xmin').text),
'ymin': int(obj.find('bndbox/ymin').text),
'xmax': int(obj.find('bndbox/xmax').text),
'ymax': int(obj.find('bndbox/ymax').text)
}
}
# input heatmaps: sorted listing of the i-th normMap folder
# (in the code shown here it is only used for its length)
normMaps_path = os.path.join(normMap_folders_path, normMap_folders[i])
normMaps = sorted([f for f in os.listdir(normMaps_path)])
# NOTE(review): same "- 1" bound as above; the last image/heatmap index is not visited.
for j in range(min(len(images),len(normMaps))-1):
im_path = os.path.join(images_path, images[j])
im = Image.open(im_path)
# Displays the opened image via the image object's show() method.
im.show()
# Build the caption; a higher args.caption value includes fewer attributes.
# 1: pose + color + name + placement
if args.caption == 1:
caption = "Pass the " + obj_info['pose'] + " " + obj_info['color'] + " " + obj_info['name'] + " " + obj_info['placement'] + "."
# 2: pose + name + placement
elif args.caption == 2:
caption = "Pass the " + obj_info['pose'] + " " + obj_info['name'] + " " + obj_info['placement'] + "."
# 3: color + name
elif args.caption == 3:
caption = "Pass the " + obj_info['color'] + " " + obj_info['name'] + "."
# 4: name only
elif args.caption == 4:
caption = "Pass the " + obj_info['name'] + "."
The final version of the caption has distinct prompts based on the category and level of detail included.
-cc
A: "The"; B: "This is a"; C: "Look at the"; D: "Point at the"; E: "Pass the"
-cd
Detail\Category | A | B | C | D | E |
---|---|---|---|---|---|
1 | "The " + Object_Pose + " " + Object_Color + " " + Object_Name + " " + Object_Placement + "." | "This is a " + Object_Pose + " " + Object_Color + " " + Object_Name + " " + Object_Placement + "." | "Look at the " + Object_Pose + " " + Object_Color + " " + Object_Name + " " + Object_Placement + "." | "Point at the " + Object_Pose + " " + Object_Color + " " + Object_Name + " " + Object_Placement + "." | "Pass the " + Object_Pose + " " + Object_Color + " " + Object_Name + " " + Object_Placement + "." |
2 | "The " + Object_Pose + " " + Object_Name + " " + Object_Placement + "." | "This is a " + Object_Pose + " " + Object_Name + " " + Object_Placement + "." | "Look at the " + Object_Pose + " " + Object_Name + " " + Object_Placement + "." | "Point at the " + Object_Pose + " " + Object_Name + " " + Object_Placement + "." | "Pass the " + Object_Pose + " " + Object_Name + " " + Object_Placement + "." |
3 | "The " + Object_Color + " " + Object_Name + "." | "This is a " + Object_Color + " " + Object_Name + "." | "Look at the " + Object_Color + " " + Object_Name + "." | "Point at the " + Object_Color + " " + Object_Name + "." | "Pass the " + Object_Color + " " + Object_Name + "." |
4 | "The " + Object_Name + "." | "This is a " + Object_Name + "." | "Look at the " + Object_Name + "." | "Point at the " + Object_Name + "." | "Pass the " + Object_Name + "." |
# Command-line interface: one option picks the caption's opening phrase
# (category), the other picks how many object attributes it mentions.
parser = argparse.ArgumentParser(description='Caption format selection')
parser.add_argument(
    '-cc',
    '--caption_category',
    type=str,
    choices=['A', 'B', 'C', 'D', 'E'],
    default='A',
    help='Specify a value (A, B, C, D, E) to determine the caption category. A:The, B:This is a, C:Look at the, D:Point at the, E:Pass the',
)
parser.add_argument(
    '-cd',
    '--caption_details',
    type=int,
    choices=[1, 2, 3, 4],
    default=1,
    help='Specify a detail level as (1, 2, 3, 4) to determine the caption details. 1:pose+color+name+placement, 2:pose+name+placement, 3:color+name, 4:name',
)
# Parsed options: args.caption_category and args.caption_details.
args = parser.parse_args()
# Caption creation
# The category selects the opening phrase; the detail level selects which
# attributes of obj_info follow it.
caption_category = args.caption_category
caption_details = args.caption_details

# Opening phrase (with its trailing space) for each caption category.
lead_by_category = {
    'A': "The ",
    'B': "This is a ",
    'C': "Look at the ",
    'D': "Point at the ",
    'E': "Pass the ",
}

# As with the original chain, an unknown category or detail level assigns nothing.
if caption_category in lead_by_category:
    lead = lead_by_category[caption_category]
    if caption_details == 1:
        # pose + color + name + placement
        caption = lead + obj_info['pose'] + " " + obj_info['color'] + " " + obj_info['name'] + " " + obj_info['placement'] + "."
    elif caption_details == 2:
        # pose + name + placement
        caption = lead + obj_info['pose'] + " " + obj_info['name'] + " " + obj_info['placement'] + "."
    elif caption_details == 3:
        # color + name
        caption = lead + obj_info['color'] + " " + obj_info['name'] + "."
    elif caption_details == 4:
        # name only
        caption = lead + obj_info['name'] + "."
print('Caption: ', caption)
# Define caption templates
# Opening phrase contributed by each caption category.
category_leads = {
    'A': "The",
    'B': "This is a",
    'C': "Look at the",
    'D': "Point at the",
    'E': "Pass the",
}
# Placeholder sequence contributed by each detail level; the placeholder
# names are keys of obj_info.
detail_fields = {
    1: "{pose} {color} {name} {placement}",
    2: "{pose} {name} {placement}",
    3: "{color} {name}",
    4: "{name}",
}
# caption_templates[category][detail] -> format string, e.g.
# caption_templates['C'][3] == "Look at the {color} {name}."
caption_templates = {
    category: {
        level: lead + " " + fields + "."
        for level, fields in detail_fields.items()
    }
    for category, lead in category_leads.items()
}

# Construct caption
caption_category = args.caption_category
caption_details = args.caption_details
selected_template = caption_templates[caption_category][caption_details]
# Keys of obj_info that the template does not reference are ignored by format().
caption = selected_template.format(**obj_info)
print('caption: ', caption)
My idea is to categorize the prompts into distinct groups based on the information they include. Each category will include the information as follows:
Sentence formation:
TBD: Another thing to consider is the verb and the form of the sentence. As observed earlier, 'Pass me' confuses the code; however, 'Pass' on its own can also be confusing, since it implies moving the object in the specified direction.