guanhdrmq opened 11 months ago
```python
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm


class VQADataset(torch.utils.data.Dataset):
    """VQA (v2) dataset."""

    def __init__(self, questions, annotations, tokenizer, image_preprocess, frcnn, frcnn_cfg):
        self.questions = questions
        self.annotations = annotations
        self.tokenizer = tokenizer
        self.image_preprocess = image_preprocess
        self.frcnn = frcnn
        self.frcnn_cfg = frcnn_cfg

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        # answer annotation and the matching question
        annotation = self.annotations[idx]
        questions = self.questions[idx]

        image_path = id_to_filename[annotation["image_id"]]
        image_path = image_path.replace("./multimodal_data/vqa2/val2014/.", "", 1)

        text = questions["question"]
        inputs = self.tokenizer(
            text,
            padding="max_length",
            max_length=25,
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors="pt",
        )

        # extract region features with the Faster R-CNN backbone
        images, sizes, scales_yx = self.image_preprocess(image_path)
        output_dict = self.frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=self.frcnn_cfg.max_detections,
            return_tensors="pt",
        )

        # very important that the boxes are normalized
        feature = output_dict.get("roi_features")
        normalized_boxes = output_dict.get("normalized_boxes")

        inputs.update(
            {
                "visual_embeds": feature,
                "visual_attention_mask": torch.ones(feature.shape[:-1], dtype=torch.float),
                # "visual_token_type_ids": torch.ones(feature.shape[:-1], dtype=torch.long),
                "output_attentions": False,
            }
        )

        # remove the batch dimension the tokenizer and frcnn added
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                inputs[k] = v.squeeze()

        # build soft targets over the answer vocabulary
        labels = annotation["labels"]
        scores = annotation["scores"]
        targets = torch.zeros(len(config.id2label), dtype=torch.float)
        for label, score in zip(labels, scores):
            targets[label] = score
        inputs["labels"] = targets

        # kept for debugging; popped off the batch before the forward pass
        inputs["text"] = text
        return inputs
```
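The class above relies on two globals that aren't shown: `id_to_filename` (image id → file path) and `config` (carrying `id2label`/`label2id` over the answer vocabulary). A minimal sketch of how they might be built, assuming the standard COCO naming scheme `COCO_val2014_{image_id:012d}.jpg` and a hypothetical precomputed answer-vocabulary JSON; both names and paths here are assumptions, not the original author's code:

```python
import json
import os

# Hypothetical reconstruction of the globals used by VQADataset.
root = "./multimodal_data/vqa2/val2014"
id_to_filename = {
    ann["image_id"]: os.path.join(root, f"COCO_val2014_{ann['image_id']:012d}.jpg")
    for ann in annotations
}

class AnswerSpace:
    """Stand-in for `config`: holds the label2id/id2label mappings."""
    def __init__(self, label2id):
        self.label2id = label2id
        self.id2label = {v: k for k, v in label2id.items()}

# hypothetical vocab file: {"yes": 0, "no": 1, ...} with 3129 entries
with open("vqa2_answer_vocab.json") as f:
    config = AnswerSpace(json.load(f))
```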
```python
from visualbert.processing_image import Preprocess
from visualbert.visualizing_image import SingleImageViz
from visualbert.modeling_frcnn import GeneralizedRCNN
from visualbert.utils import Config

frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg)
image_preprocess = Preprocess(frcnn_cfg)
```
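Before wiring the extractor into the dataset, it can help to sanity-check it on a single image. A sketch with a hypothetical image path; the `roi_features` width of 2048 is what the `unc-nlp/frcnn-vg-finetuned` checkpoint is expected to produce:

```python
# Quick sanity check on one image (path is hypothetical).
images, sizes, scales_yx = image_preprocess(
    "./multimodal_data/vqa2/val2014/COCO_val2014_000000000042.jpg"
)
output_dict = frcnn(
    images,
    sizes,
    scales_yx=scales_yx,
    padding="max_detections",
    max_detections=frcnn_cfg.max_detections,
    return_tensors="pt",
)
print(output_dict["roi_features"].shape)      # expected: (1, max_detections, 2048)
print(output_dict["normalized_boxes"].shape)  # expected: (1, max_detections, 4)
```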
```python
from transformers import VisualBertForQuestionAnswering, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

model = VisualBertForQuestionAnswering.from_pretrained(
    "uclanlp/visualbert-vqa",
    num_labels=len(config.id2label),
    id2label=config.id2label,
    label2id=config.label2id,
    output_hidden_states=True,
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
```
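One easy mismatch to rule out here: the region features must be as wide as the model's visual projection expects. A short check, assuming the `output_dict` from the sanity check above and the `visual_embedding_dim` attribute that VisualBERT configs expose:

```python
# roi_features must match the width VisualBERT projects from.
assert output_dict["roi_features"].shape[-1] == model.config.visual_embedding_dim
```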
```python
dataset = VQADataset(
    questions=questions[:100],
    annotations=annotations[:100],
    tokenizer=tokenizer,
    image_preprocess=image_preprocess,
    frcnn=frcnn,
    frcnn_cfg=frcnn_cfg,
)

test_dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

correct = 0.0
total = 0
```
```python
for batch in tqdm(test_dataloader):
    text = batch.pop("text")  # strings can't be moved to a device or passed to the model
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits  # [batch_size, 3129]

    _, pre = torch.max(logits, 1)
    _, target = torch.max(batch["labels"], 1)
    print("prediction:", pre)
    print("target:", target)
    print("Predicted answer:", model.config.id2label[pre.item()])
    print("Target answer:", model.config.id2label[target.item()])

    correct += (pre == target).sum().item()
    total += 1
    print(total)

final_acc = correct / float(len(test_dataloader.dataset))
print("Accuracy of test: %f %%" % (100 * float(final_acc)))
```
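A note on the metric: the loop above scores a prediction correct only when it equals the argmax of the soft target, while the official VQA accuracy credits the annotator-agreement score of whatever answer is predicted. A sketch of that variant, assuming the `scores` in the annotations already follow the official `min(#matching annotators / 3, 1)` convention:

```python
# Soft VQA accuracy: each prediction earns the agreement score stored
# in the soft target vector rather than a 0/1 exact-match credit.
soft_score = 0.0
for batch in test_dataloader:
    batch.pop("text")
    labels = batch.pop("labels")
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits
    pred = logits.argmax(dim=1).cpu()               # [batch_size]
    soft_score += labels.gather(1, pred.unsqueeze(1)).sum().item()

print("Soft VQA accuracy: %.2f %%" % (100 * soft_score / len(test_dataloader.dataset)))
```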