facebookresearch / mmf

A modular framework for vision & language multimodal research from Facebook AI Research (FAIR)
https://mmf.sh/

How do you actually run pythia on an image without the notebook? #204

Closed · inafi closed this 4 years ago

inafi commented 4 years ago

❓ Questions and Help

I want to run pythia on my Ubuntu 18.04 machine (which I'm accessing remotely; it has three 2080s), but the docs are unclear on how to run pythia on a specific image from the terminal. I've been following the instructions for running pre-trained models, and that works, but when I try to specify a custom dataset it just gives me a "directory doesn't exist" error.

I'm using this command:

python tools/run.py --tasks vqa --datasets textvqa --model lorra --config configs/vqa/textvqa/lorra.yml --run_type inference --evalai_inference 1 --resume_file data/models/lorra_best.pth --num_workers 0

I've been trying to change the 'textvqa' value after '--datasets'. I'm completely new to pythia, by the way, so I might just be missing something obvious, but please help me solve this.

apsdehal commented 4 years ago

Can you provide an exact log/screenshot of the error? It is probably happening because you haven't downloaded textvqa's imdb and features, as mentioned in the getting started guide.

inafi commented 4 years ago

The custom images I want to run are in the "~/pythia/images" folder (inside the main pythia directory). Here's the error:

(base) inafi@cm:~/pythia$ python tools/run.py --tasks vqa --datasets images/person.jpg --model lorra --config configs/vqa/textvqa/lorra.yml --run_type inference --evalai_inference 1 --resume_file data/models/lorra_best.pth --num_workers 0
Logging to: ./save/vqa_images/person.jpg_lorra/logs/vqa_images/person.jpg_lorra_2019-12-06T08:14:05.log
Traceback (most recent call last):
  File "tools/run.py", line 94, in <module>
    run()
  File "tools/run.py", line 82, in run
    trainer.load()
  File "/home/inafi/pythia/pythia/trainers/base_trainer.py", line 37, in load
    self.writer = Logger(self.config)
  File "/home/inafi/pythia/pythia/utils/logger.py", line 64, in __init__
    channel = logging.FileHandler(filename=self.log_filename, mode="a")
  File "/home/inafi/miniconda3/lib/python3.6/logging/__init__.py", line 1032, in __init__
    StreamHandler.__init__(self, self._open())
  File "/home/inafi/miniconda3/lib/python3.6/logging/__init__.py", line 1061, in _open
    return open(self.baseFilename, self.mode, encoding=self.encoding)
FileNotFoundError: [Errno 2] No such file or directory: '/home/inafi/pythia/save/vqa_images/person.jpg_lorra/logs/vqa_images/person.jpg_lorra_2019-12-06T08:14:05.log'

apsdehal commented 4 years ago

This is not how direct inference is meant to be used. If you are looking for a setup like the colab demo, I would suggest copying the colab's code into a script and adding your own arguments on top of it with ArgumentParser for the fields that we specify in colab via text fields.
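For illustration, a minimal wrapper along those lines might look like the following. This is a sketch only: PythiaDemo is the class from the colab notebook, assumed to be defined earlier in the same script, and the argument names are illustrative, not an official pythia CLI.

import argparse

# Sketch: PythiaDemo is the class copied from the colab notebook,
# defined earlier in this same script.
parser = argparse.ArgumentParser(description="Ask a question about a single image")
parser.add_argument("--image", required=True, help="path or URL of the input image")
parser.add_argument("--question", required=True, help="question to ask about the image")
args = parser.parse_args()

demo = PythiaDemo()
scores, answers = demo.predict(args.image, args.question)
for answer, score in zip(answers, scores):
    print(answer, round(score * 100, 1))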

More details on why what you are doing won't work:

In inference mode inside the pythia repo, the value passed to --datasets must be a dataset registered with pythia. See how to register a new dataset in pythia at https://learnpythia.readthedocs.io/en/latest/tutorials/dataset.html. The pythia model then expects the dataset to pass back extracted image features, so passing an image path directly won't work.
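For context, registration follows the builder pattern from that tutorial. A rough sketch, assuming the module paths and hook names shown in the tutorial (they may differ across pythia versions); "my_images" is a hypothetical dataset key:

from pythia.common.registry import registry
from pythia.tasks.base_dataset_builder import BaseDatasetBuilder

@registry.register_builder("my_images")
class MyImagesBuilder(BaseDatasetBuilder):
    def __init__(self):
        super().__init__("my_images")

    def _build(self, dataset_type, config):
        # download or prepare the raw data here
        pass

    def _load(self, dataset_type, config, *args, **kwargs):
        # return a dataset whose samples already carry extracted image features
        pass

Once registered, "my_images" would become a valid value for --datasets.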

inafi commented 4 years ago

I successfully converted the notebook to a py file (it works!). Here's the code:

from pythia.common.sample import Sample, SampleList
from pythia.common.registry import registry
from pythia.models.pythia import Pythia
from pythia.tasks.processors import VocabProcessor, VQAAnswerProcessor
from pythia.utils.configuration import ConfigNode
from maskrcnn_benchmark.utils.model_serialization import load_state_dict
from maskrcnn_benchmark.structures.image_list import to_image_list
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.layers import nms
from maskrcnn_benchmark.config import cfg
from io import BytesIO
from ipywidgets import widgets, Layout
from IPython.display import display, HTML, clear_output
from PIL import Image
import torchvision.transforms as transforms
import torchvision.models as models
import pandas as pd
import torch.nn.functional as F
import gc
import numpy as np
import requests
import torch
import cv2
import yaml
import sys
import time

start = time.time()
sys.path.append('content/pythia')
sys.path.append('content/vqa-maskrcnn-benchmark')

class PythiaDemo:
    TARGET_IMAGE_SIZE = [448, 448]
    CHANNEL_MEAN = [0.485, 0.456, 0.406]
    CHANNEL_STD = [0.229, 0.224, 0.225]

    def __init__(self):
        self._init_processors()
        self.pythia_model = self._build_pythia_model()
        self.detection_model = self._build_detection_model()
        self.resnet_model = self._build_resnet_model()

    def _init_processors(self):
        with open("content/model_data/pythia.yaml") as f:
            config = yaml.safe_load(f)  # yaml.load without a Loader is deprecated in newer PyYAML

        config = ConfigNode(config)
        # Remove warning
        config.training_parameters.evalai_inference = True
        registry.register("config", config)

        self.config = config

        vqa_config = config.task_attributes.vqa.dataset_attributes.vqa2
        text_processor_config = vqa_config.processors.text_processor
        answer_processor_config = vqa_config.processors.answer_processor

        text_processor_config.params.vocab.vocab_file = "content/model_data/vocabulary_100k.txt"
        answer_processor_config.params.vocab_file = "content/model_data/answers_vqa.txt"
        # Add preprocessor as that will be needed when we are getting questions from the user
        self.text_processor = VocabProcessor(text_processor_config.params)
        self.answer_processor = VQAAnswerProcessor(
            answer_processor_config.params)

        registry.register("vqa2_text_processor", self.text_processor)
        registry.register("vqa2_answer_processor", self.answer_processor)
        registry.register("vqa2_num_final_outputs",
                          self.answer_processor.get_vocab_size())

    def _build_pythia_model(self):
        state_dict = torch.load('content/model_data/pythia.pth')
        model_config = self.config.model_attributes.pythia
        model_config.model_data_dir = "content/"
        model = Pythia(model_config)
        model.build()
        model.init_losses_and_metrics()
        if list(state_dict.keys())[0].startswith('module') and \
           not hasattr(model, 'module'):
            state_dict = self._multi_gpu_state_to_single(state_dict)

        model.load_state_dict(state_dict)
        model.to("cuda")
        model.eval()
        return model

    def _build_resnet_model(self):
        self.data_transforms = transforms.Compose([
            transforms.Resize(self.TARGET_IMAGE_SIZE),
            transforms.ToTensor(),
            transforms.Normalize(self.CHANNEL_MEAN, self.CHANNEL_STD),
        ])
        resnet152 = models.resnet152(pretrained=True)
        resnet152.eval()
        modules = list(resnet152.children())[:-2]
        self.resnet152_model = torch.nn.Sequential(*modules)
        self.resnet152_model.to("cuda")

    def _multi_gpu_state_to_single(self, state_dict):
        new_sd = {}
        for k, v in state_dict.items():
            if not k.startswith('module.'):
                raise TypeError("Not a multi-GPU state dict")
            k1 = k[7:]
            new_sd[k1] = v
        return new_sd

    def predict(self, url, question):
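        # End-to-end VQA pass: extract detectron and resnet features for the
        # image, pack them with the processed question into a SampleList,
        # and return the top-5 answers with their softmax probabilities.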
        with torch.no_grad():
            detectron_features = self.get_detectron_features(url)
            resnet_features = self.get_resnet_features(url)

            sample = Sample()

            processed_text = self.text_processor({"text": question})
            sample.text = processed_text["text"]
            sample.text_len = len(processed_text["tokens"])

            sample.image_feature_0 = detectron_features
            sample.image_info_0 = Sample({
                "max_features": torch.tensor(100, dtype=torch.long)
            })

            sample.image_feature_1 = resnet_features

            sample_list = SampleList([sample])
            sample_list = sample_list.to("cuda")

            scores = self.pythia_model(sample_list)["scores"]
            scores = torch.nn.functional.softmax(scores, dim=1)
            actual, indices = scores.topk(5, dim=1)

            top_indices = indices[0]
            top_scores = actual[0]

            probs = []
            answers = []

            for idx, score in enumerate(top_scores):
                probs.append(score.item())
                answers.append(
                    self.answer_processor.idx2word(top_indices[idx].item())
                )

        gc.collect()
        torch.cuda.empty_cache()
        return probs, answers

    def _build_detection_model(self):

        cfg.merge_from_file('content/model_data/detectron_model.yaml')
        cfg.freeze()

        model = build_detection_model(cfg)
        checkpoint = torch.load('content/model_data/detectron_model.pth',
                                map_location=torch.device("cpu"))

        load_state_dict(model, checkpoint.pop("model"))

        model.to("cuda")
        model.eval()
        return model

    def get_actual_image(self, image_path):
        if image_path.startswith('http'):
            path = requests.get(image_path, stream=True).raw
        else:
            path = image_path
        return path

    def _image_transform(self, image_path):
        path = self.get_actual_image(image_path)
        img = Image.open(path)
        im = np.array(img).astype(np.float32)
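        # Convert RGB to BGR and subtract Caffe-style channel means,
        # matching the preprocessing the detection model was trained with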
        im = im[:, :, ::-1]
        im -= np.array([102.9801, 115.9465, 122.7717])
        im_shape = im.shape
        im_size_min = np.min(im_shape[0:2])
        im_size_max = np.max(im_shape[0:2])
        im_scale = float(800) / float(im_size_min)
        # Prevent the biggest axis from being more than max_size
        if np.round(im_scale * im_size_max) > 1333:
            im_scale = float(1333) / float(im_size_max)
        im = cv2.resize(
            im,
            None,
            None,
            fx=im_scale,
            fy=im_scale,
            interpolation=cv2.INTER_LINEAR
        )
        img = torch.from_numpy(im).permute(2, 0, 1)
        return img, im_scale

    def _process_feature_extraction(self, output,
                                    im_scales,
                                    feat_name='fc6',
                                    conf_thresh=0.2):
        batch_size = len(output[0]["proposals"])
        n_boxes_per_image = [len(_) for _ in output[0]["proposals"]]
        score_list = output[0]["scores"].split(n_boxes_per_image)
        score_list = [torch.nn.functional.softmax(x, -1) for x in score_list]
        feats = output[0][feat_name].split(n_boxes_per_image)
        cur_device = score_list[0].device

        feat_list = []

        for i in range(batch_size):
            dets = output[0]["proposals"][i].bbox / im_scales[i]
            scores = score_list[i]
            max_conf = torch.zeros((scores.shape[0])).to(cur_device)
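            # Per-class NMS; each box keeps its highest surviving class score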
            for cls_ind in range(1, scores.shape[1]):
                cls_scores = scores[:, cls_ind]
                keep = nms(dets, cls_scores, 0.5)
                max_conf[keep] = torch.where(cls_scores[keep] > max_conf[keep],
                                             cls_scores[keep],
                                             max_conf[keep])

            keep_boxes = torch.argsort(max_conf, descending=True)[:100]
            feat_list.append(feats[i][keep_boxes])
        return feat_list

    def masked_unk_softmax(self, x, dim, mask_idx):
        x1 = F.softmax(x, dim=dim)
        x1[:, mask_idx] = 0
        x1_sum = torch.sum(x1, dim=1, keepdim=True)
        y = x1 / x1_sum
        return y

    def get_resnet_features(self, image_path):
        path = self.get_actual_image(image_path)
        img = Image.open(path).convert("RGB")
        img_transform = self.data_transforms(img)
        if img_transform.shape[0] == 1:
            img_transform = img_transform.expand(3, -1, -1)
        img_transform = img_transform.unsqueeze(0).to("cuda")
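        # The resnet152 trunk maps a 448x448 input to a 14x14x2048 feature
        # map, which is flattened below into 196 grid features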
        features = self.resnet152_model(img_transform).permute(0, 2, 3, 1)
        features = features.view(196, 2048)
        return features

    def get_detectron_features(self, image_path):
        im, im_scale = self._image_transform(image_path)
        img_tensor, im_scales = [im], [im_scale]
        current_img_list = to_image_list(img_tensor, size_divisible=32)
        current_img_list = current_img_list.to('cuda')
        with torch.no_grad():
            output = self.detection_model(current_img_list)
        feat_list = self._process_feature_extraction(output, im_scales, 'fc6', 0.2)
        return feat_list[0]

demo = PythiaDemo()

#image_text = "http://images.cocodataset.org/train2017/000000505539.jpg"
image_text = "giraffe.jpg"
question_text = input("Question: ")  # the original used an undefined cli.get() helper; read the question from stdin instead
clear_output()
image_path = demo.get_actual_image(image_text)
image = Image.open(image_path)

scores, predictions = demo.predict(image_text, question_text)
scores = [score * 100 for score in scores]
df = pd.DataFrame({
    "Prediction": predictions,
    "Confidence": scores
})

if len(df.Prediction) != 0:
    output = df.Prediction[0]
    if df.Confidence[0] < 70:
        output = "I don't know the answer to that"
else:
    output = "I don't know the answer to that"
print(output, time.time() - start)
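To ask several questions without re-loading all three models each time, the demo object above can be reused in a loop; a minimal sketch, using the demo and image_text already defined in the script:

while True:
    question = input("Question (blank to quit): ")
    if not question:
        break
    scores, answers = demo.predict(image_text, question)
    print(answers[0], round(scores[0] * 100, 1))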