microsoft / FIBER

Coarse-to-Fine Vision-Language Pre-training with Fusion in the Backbone
MIT License
128 stars 11 forks source link

Output of pre-trained model is invalid ! #17

Open XiaokunFeng opened 1 year ago

XiaokunFeng commented 1 year ago

I want to transfer this model to my own task. However, when I ran forward inference with the provided pre-trained weights (fiber_refcoco.pth), I found that the model's output for a simple sample was invalid.

Specifically, I wrote a simple forward-inference script based on test_grounding_net.py (shown below). The model completes the forward pass, but for a simple sample it outputs zero bounding boxes ([BoxList(num_boxes=0, image_width=1280, image_height=768, mode=xyxy)]).

May I ask whether there are any steps in the forward process that require special attention? Thank you very much!

from maskrcnn_benchmark.utils.env import setup_environment  # noqa F401 isort:skip

import argparse
from maskrcnn_benchmark.data.transforms import transforms as T
from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.modeling.detector import build_detection_model
from maskrcnn_benchmark.utils.comm import synchronize, get_rank
from maskrcnn_benchmark.utils.logger import setup_logger
from maskrcnn_benchmark.utils.miscellaneous import mkdir
import os
import datetime

import torch
import torch.distributed as dist

from maskrcnn_benchmark.data.transforms.build import build_transforms
from transformers import AutoTokenizer
from PIL import Image

def init_distributed_mode(args):
    """Set up the torch.distributed process group from the launcher environment.

    Two launch styles are recognised:

    * ``RANK`` and ``WORLD_SIZE`` both present in the environment: rank,
      world size and GPU index (``LOCAL_RANK``) are read from the environment.
    * ``SLURM_PROCID`` present: the rank comes from SLURM and the GPU index is
      the rank modulo the number of visible CUDA devices. ``args.world_size``
      is left at whatever value ``args`` already carries.

    If neither applies, ``args.distributed`` is set to ``False`` and the
    function returns without touching CUDA or the process group.

    Args:
        args: mutable namespace; must provide ``dist_url`` (and ``world_size``
            for the SLURM path). ``rank``, ``gpu`` and ``dist_backend`` are
            written onto it.
    """
    env = os.environ
    has_launcher_vars = "RANK" in env and "WORLD_SIZE" in env

    # Guard clause: nothing in the environment indicates a distributed launch.
    if not has_launcher_vars and "SLURM_PROCID" not in env:
        print("Not using distributed mode")
        args.distributed = False
        return

    if has_launcher_vars:
        args.rank = int(env["RANK"])
        args.world_size = int(env["WORLD_SIZE"])
        args.gpu = int(env["LOCAL_RANK"])
    else:
        args.rank = int(env["SLURM_PROCID"])
        args.gpu = args.rank % torch.cuda.device_count()

    # NOTE: args.distributed is not assigned on this path; the assignment
    # below is disabled in this script.
    # args.distributed = True

    torch.cuda.set_device(args.gpu)
    args.dist_backend = "nccl"
    print("| distributed init (rank {}): {}".format(args.rank, args.dist_url), flush=True)

    # Two-hour timeout for process-group initialisation.
    init_timeout = datetime.timedelta(days=0, seconds=7200)
    dist.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
        timeout=init_timeout,
    )
    # Wait for every rank before silencing non-master output.
    dist.barrier()
    setup_for_distributed(args.rank == 0)

def setup_for_distributed(is_master):
    """Replace the global ``print`` so that only the master process prints.

    After this call, ``print(...)`` is a no-op unless ``is_master`` is true
    or the caller passes ``force=True``. The ``force`` keyword is consumed
    here and never forwarded to the real ``print``.

    Args:
        is_master: whether the current process should print unconditionally.
    """
    import builtins

    original_print = builtins.print

    def print(*args, **kwargs):
        # Always strip "force" so the real print never sees an unknown kwarg.
        forced = kwargs.pop("force", False)
        if not (is_master or forced):
            return
        original_print(*args, **kwargs)

    # Installed process-wide: every module's print() now goes through the gate.
    builtins.print = print

def model_infer_test():
    """Run one image/caption pair through the grounding model and print the output.

    Steps:
      1. Parse command-line options (config file, checkpoint path, distributed
         settings, trailing config overrides).
      2. Initialise distributed mode when ``WORLD_SIZE`` > 1.
      3. Merge the config, build the detection model and load the checkpoint
         given by ``--weight`` non-strictly, reporting missing/unexpected keys.
      4. Load a hard-coded sample image, resize and normalise it, and run a
         gradient-free forward pass in eval mode with a hard-coded caption.

    The raw model output is printed; nothing is returned.
    """
    parser = argparse.ArgumentParser(description="PyTorch Detection to Grounding Inference")
    parser.add_argument(
        "--config-file",
        default="configs/refcoco.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--weight",
        default='model_weight/fiber_refcoco.pth',
        metavar="FILE",
        help="path to the model checkpoint",
    )
    parser.add_argument("--local_rank", type=int, default=0)
    parser.add_argument(
        "opts", help="Modify config options using the command-line", default=None, nargs=argparse.REMAINDER
    )
    parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
    parser.add_argument("--dist-url", default="env://", help="url used to set up distributed training")

    parser.add_argument("--task_config", default=None)

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    distributed = num_gpus > 1

    if distributed:
        init_distributed_mode(args)
        print("Passed distributed init")

    cfg.local_rank = args.local_rank
    cfg.num_gpus = num_gpus

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    # Build the model and move it to the configured device.
    model = build_detection_model(cfg)
    model.to(cfg.MODEL.DEVICE)

    # Honour --weight instead of a second hard-coded path (the default is the
    # same file, so running without arguments behaves as before).
    # torch.load unpickles the file: only load checkpoints from trusted sources.
    checkpoint = torch.load(args.weight, map_location='cpu')
    missing_keys, unexpected_keys = model.load_state_dict(checkpoint['model'], strict=False)
    # Keys ending in total_params / total_ops are filtered out of the report.
    unexpected_keys = [k for k in unexpected_keys if not k.endswith(('total_params', 'total_ops'))]
    if missing_keys:
        print('Missing Keys: {}'.format(missing_keys))
    if unexpected_keys:
        print('Unexpected Keys: {}'.format(unexpected_keys))

    model.eval()

    # Hard-coded sample: one image and one referring expression.
    image_path = "resource/ZhuoQiu_video_02-Done/imgs/00002.jpg"
    txt = "the ball"

    # NOTE(review): these normalisation constants are hard-coded rather than
    # taken from cfg (e.g. via the imported build_transforms). If the model was
    # trained with a different pixel range / channel order / mean / std, the
    # predictions will be poor -- confirm against the config used for training.
    transform = T.Compose(
        [
            T.ToTensor(),
            T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )

    # Close the file handle once the pixels have been decoded into RGB.
    with Image.open(image_path) as raw_image:
        img = raw_image.convert('RGB')
    # Original size is (1280, 720); resize to (1280, 768) so that both sides
    # are divisible by 32.
    img = img.resize((1280, 768))

    imgs = torch.stack([transform(img)], dim=0).to(cfg.MODEL.DEVICE)

    # NOTE(review): only the raw caption is passed here. The reference script
    # (test_grounding_net.py) may also supply a positive map linking caption
    # tokens to labels; if the post-processing relies on it, omitting it could
    # explain an empty BoxList -- TODO confirm against the reference script.
    # Inference only: disable autograd to avoid building a graph.
    with torch.no_grad():
        output = model(images=imgs, captions=[txt])

    print(output)

# Script entry point: run the single-sample inference test, then print a
# completion marker.
if __name__ == "__main__":
    model_infer_test()
    print("done!")

Here is the test image: 00002

And this is the model's output: `[BoxList(num_boxes=0, image_width=1280, image_height=768, mode=xyxy)]`