Can you provide an exact log/screenshot of the error? It is probably happening because you haven't downloaded textvqa's imdb and features, as mentioned in the getting-started docs.
The custom images I want to run are in the "~/pythia/images" folder (main pythia directory). Here's the error:
```
(base) inafi@cm:~/pythia$ python tools/run.py --tasks vqa --datasets images/person.jpg --model lorra --config configs/vqa/textvqa/lorra.yml --run_type inference --evalai_inference 1 --resume_file data/models/lorra_best.pth --num_workers 0
Logging to: ./save/vqa_images/person.jpg_lorra/logs/vqa_images/person.jpg_lorra_2019-12-06T08:14:05.log
Traceback (most recent call last):
  File "tools/run.py", line 94, in
```
This is not how direct inference is meant to be used. If you are looking for a setup like the colab demo, I would suggest copying the colab code into a script and adding your own arguments on top of it with ArgumentParser for the fields that we specify in colab via text fields.
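For example, a minimal sketch of that approach (the flag names here are placeholders I made up, not an existing pythia CLI):

```python
import argparse

# Hypothetical flags standing in for the colab text fields; PythiaDemo is
# the class copied out of the colab notebook.
parser = argparse.ArgumentParser(description="Run the Pythia colab demo as a script")
parser.add_argument("--image", required=True, help="path or URL of the input image")
parser.add_argument("--question", required=True, help="question to ask about the image")
args = parser.parse_args()

demo = PythiaDemo()
scores, predictions = demo.predict(args.image, args.question)
for answer, score in zip(predictions, scores):
    print(answer, score)
```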
More details on why what you are doing won't work:
In inference mode inside the pythia repo, the dataset passed to --datasets is expected to be registered with pythia. See how to register a new dataset in pythia at https://learnpythia.readthedocs.io/en/latest/tutorials/dataset.html. The pythia model then expects the dataset to pass back features of the image, so passing an image file directly won't work.
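Roughly, registration follows the shape below (a minimal outline based on that tutorial; "my_images", MyImagesBuilder, and MyImagesDataset are made-up names):

```python
from pythia.common.registry import registry
from pythia.tasks.base_dataset_builder import BaseDatasetBuilder


# Sketch only: the registered key ("my_images") is what --datasets expects.
@registry.register_builder("my_images")
class MyImagesBuilder(BaseDatasetBuilder):
    def __init__(self):
        super().__init__("my_images")

    def _build(self, dataset_type, config):
        # Download or prepare the data here if needed
        pass

    def _load(self, dataset_type, config):
        # Must return a dataset whose samples carry extracted image
        # features, not raw image paths -- hence a bare .jpg can't work
        return MyImagesDataset(dataset_type, config)
```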
I successfully converted the notebook to a py file (it works!). Here's the code:
```python
from pythia.common.sample import Sample, SampleList
from pythia.common.registry import registry
from pythia.models.pythia import Pythia
from pythia.tasks.processors import VocabProcessor, VQAAnswerProcessor
from pythia.utils.configuration import ConfigNode
from maskrcnn_benchmark.utils.model_serialization import load_state_dict
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.layers import nms
from maskrcnn_benchmark.config import cfg
from io import BytesIO
from ipywidgets import widgets, Layout
from IPython.display import display, HTML, clear_output
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
import pandas as pd
import torch.nn.functional as F
import gc
import numpy as np
import requests
import torch
import cv2
import yaml
import sys
import time

start = time.time()
sys.path.append('content/pythia')
sys.path.append('content/vqa-maskrcnn-benchmark')

class PythiaDemo:
    TARGET_IMAGE_SIZE = [448, 448]
    CHANNEL_MEAN = [0.485, 0.456, 0.406]
    CHANNEL_STD = [0.229, 0.224, 0.225]

    def __init__(self):
        self._init_processors()
        self.pythia_model = self._build_pythia_model()
        self.detection_model = self._build_detection_model()
        self.resnet_model = self._build_resnet_model()

    def _init_processors(self):
        with open("content/model_data/pythia.yaml") as f:
            config = yaml.load(f)

        config = ConfigNode(config)
        # Remove warning
        config.training_parameters.evalai_inference = True
        registry.register("config", config)

        self.config = config

        vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
        text_processor_config = vqa_config.processors.text_processor
        answer_processor_config = vqa_config.processors.answer_processor

        text_processor_config.params.vocab.vocab_file = "content/model_data/vocabulary_100k.txt"
        answer_processor_config.params.vocab_file = "content/model_data/answers_vqa.txt"
        # Add preprocessor as that will be needed when we are getting questions from the user
        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(
            answer_processor_config.params)

        registry.register("vqa2_text_processor", self.text_processor)
        registry.register("vqa2_answer_processor", self.answer_processor)
        registry.register("vqa2_num_final_outputs",
                          self.answer_processor.get_vocab_size())

    def _build_pythia_model(self):
        state_dict = torch.load('content/model_data/pythia.pth')
        model_config = self.config.model_attributes.pythia
        model_config.model_data_dir = "content/"
        model = Pythia(model_config)
        model.build()
        model.init_losses_and_metrics()

        if list(state_dict.keys())[0].startswith('module') and \
                not hasattr(model, 'module'):
            state_dict = self._multi_gpu_state_to_single(state_dict)

        model.load_state_dict(state_dict)
        model.to("cuda")
        model.eval()
        return model

    def _build_resnet_model(self):
        self.data_transforms = transforms.Compose([
            transforms.Resize(self.TARGET_IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(self.CHANNEL_MEAN, self.CHANNEL_STD),
        ])
        resnet152 = models.resnet152(pretrained=True)
        resnet152.eval()
        modules = list(resnet152.children())[:-2]
        self.resnet152_model = torch.nn.Sequential(*modules)
        self.resnet152_model.to("cuda")

    def _multi_gpu_state_to_single(self, state_dict):
        new_sd = {}
        for k, v in state_dict.items():
            if not k.startswith('module.'):
                raise TypeError("Not a multiple GPU state of dict")
            k1 = k[7:]
            new_sd[k1] = v
        return new_sd

    def predict(self, url, question):
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample({
                "max_features": torch.tensor(100, dtype=torch.long)
            })

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.answer_processor.idx2word(top_indices[idx].item())
                )

        gc.collect()
        torch.cuda.empty_cache()

        return probs, answers

    def _build_detection_model(self):
        cfg.merge_from_file('content/model_data/detectron_model.yaml')
        cfg.freeze()

        model = build_detection_model(cfg)
        checkpoint = torch.load('content/model_data/detectron_model.pth',
                                map_location=torch.device("cpu"))

        load_state_dict(model, checkpoint.pop("model"))

        model.to("cuda")
        model.eval()
        return model

    def get_actual_image(self, image_path):
        if image_path.startswith('http'):
            path = requests.get(image_path, stream=True).raw
        else:
            path = image_path
        return path

    def _image_transform(self, image_path):
        path = self.get_actual_image(image_path)

        img = Image.open(path)
        im = np.array(img).astype(np.float32)
        im = im[:, :, ::-1]  # RGB -> BGR for the detection model
        im -= np.array([102.9801, 115.9465, 122.7717])
        im_shape = im.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])
        im_scale = float(800) / float(im_size_min)
        # Prevent the biggest axis from being more than max_size
        if np.round(im_scale * im_size_max) > 1333:
            im_scale = float(1333) / float(im_size_max)
        im = cv2.resize(
            im,
            None,
            None,
            fx=im_scale,
            fy=im_scale,
            interpolation=cv2.INTER_LINEAR
        )
        img = torch.from_numpy(im).permute(2, 0, 1)
        return img, im_scale

    def _process_feature_extraction(self, output,
                                    im_scales,
                                    feat_name='fc6',
                                    conf_thresh=0.2):
        batch_size = len(output[0]["proposals"])
        n_boxes_per_image = [len(_) for _ in output[0]["proposals"]]
        score_list = output[0]["scores"].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        feats = output[0][feat_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feat_list = []

        for i in range(batch_size):
            dets = output[0]["proposals"][i].bbox / im_scales[i]
            scores = score_list[i]

            max_conf = torch.zeros((scores.shape[0])).to(cur_device)

            for cls_ind in range(1, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.5)
                max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep],
                                             cls_scores[keep],
                                             max_conf[keep])

            keep_boxes = torch.argsort(max_conf, descending=True)[:100]
            feat_list.append(feats[i][keep_boxes])
        return feat_list

    def masked_unk_softmax(self, x, dim, mask_idx):
        x1 = F.softmax(x, dim=dim)
        x1[:, mask_idx] = 0
        x1_sum = torch.sum(x1, dim=1, keepdim=True)
        y = x1 / x1_sum
        return y

    def get_resnet_features(self, image_path):
        path = self.get_actual_image(image_path)
        img = Image.open(path).convert("RGB")
        img_transform = self.data_transforms(img)

        if img_transform.shape[0] == 1:
            img_transform = img_transform.expand(3, -1, -1)
        img_transform = img_transform.unsqueeze(0).to("cuda")

        features = self.resnet152_model(img_transform).permute(0, 2, 3, 1)
        features = features.view(196, 2048)
        return features

    def get_detectron_features(self, image_path):
        im, im_scale = self._image_transform(image_path)
        img_tensor, im_scales = [im], [im_scale]
        current_img_list = to_image_list(img_tensor, size_divisible=32)
        current_img_list = current_img_list.to('cuda')
        with torch.no_grad():
            output = self.detection_model(current_img_list)
        feat_list = self._process_feature_extraction(output, im_scales,
                                                     'fc6', 0.2)
        return feat_list[0]

demo = PythiaDemo()

# image_text = "http://images.cocodataset.org/train2017/000000505539.jpg"
image_text = "giraffe.jpg"
# `cli` is defined elsewhere in our setup; replace with input()/argparse as needed
question_text = str(cli.get("pythia-question"))

clear_output()
image_path = demo.get_actual_image(image_text)
image = Image.open(image_path)

scores, predictions = demo.predict(image_text, question_text)
scores = [score * 100 for score in scores]

df = pd.DataFrame({
    "Prediction": predictions,
    "Confidence": scores
})

if len(df.Prediction) != 0:
    output = df.Prediction[0]
    if df.Confidence[0] < 70:
        output = "I don't know the answer to that"
else:
    output = "I don't know the answer to that"
print(output, time.time() - start)
```
❓ Questions and Help
I want to run pythia on my Ubuntu 18.04 machine (which I'm accessing remotely; it also has three RTX 2080s), but the docs are unclear on how to run pythia on a specific image from the terminal. I've been following the instructions on how to run pre-trained models, and that works, but when I try to specify a specific dataset it just gives me a "directory doesn't exist" error.
I'm using this command:
```
python tools/run.py --tasks vqa --datasets textvqa --model lorra --config configs/vqa/textvqa/lorra.yml --run_type inference --evalai_inference 1 --resume_file data/models/lorra_best.pth --num_workers 0
```
I've been trying to change the 'textvqa' after '--datasets'. I'm completely new to pythia, by the way, so I might just be clueless, but please help me solve this.