facebookresearch / detectron2

Detectron2 is a platform for object detection, segmentation and other visual recognition tasks.
https://detectron2.readthedocs.io/en/latest/
Apache License 2.0

Registered Datasets are listed as empty? #2698

Closed · michelewang closed this issue 3 years ago

michelewang commented 3 years ago

Instructions To Reproduce the 🐛 Bug:

  1. Full runnable code or full changes you made (see `main` and `get_building_dicts` for the changes I made to the original DeepLab training script):
    
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
"""
DeepLab Training Script.

This script is a simplified version of the training script in detectron2/tools.
"""

import os
import torch

import detectron2.data.transforms as T
import detectron2.utils.comm as comm
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import DatasetMapper, MetadataCatalog, build_detection_train_loader
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch
from detectron2.evaluation import CityscapesSemSegEvaluator, DatasetEvaluators, SemSegEvaluator
from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler

import numpy as np
import json
import matplotlib.pyplot as plt
import cv2
import random
import glob
from datetime import datetime
import pickle
from pathlib import Path
from tqdm import tqdm

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.structures import BoxMode
from detectron2.utils.visualizer import ColorMode

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer


def build_sem_seg_train_aug(cfg):
    augs = [
        T.ResizeShortestEdge(
            cfg.INPUT.MIN_SIZE_TRAIN, cfg.INPUT.MAX_SIZE_TRAIN, cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING
        )
    ]
    if cfg.INPUT.CROP.ENABLED:
        augs.append(
            T.RandomCrop_CategoryAreaConstraint(
                cfg.INPUT.CROP.TYPE,
                cfg.INPUT.CROP.SIZE,
                cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
                cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
            )
        )
    augs.append(T.RandomFlip())
    return augs


class Trainer(DefaultTrainer):
    """
    We use the "DefaultTrainer", which contains a number of pre-defined
    components for the standard training workflow. They may not work for you,
    especially if you are working on a new research project. In that case you
    can use the cleaner "SimpleTrainer", or write your own training loop.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create evaluator(s) for a given dataset.
        This uses the special metadata "evaluator_type" associated with each builtin dataset.
        For your own dataset, you can simply create an evaluator manually in your
        script and do not have to worry about the hacky if-else logic here.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_list = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
        if evaluator_type == "sem_seg":
            return SemSegEvaluator(
                dataset_name,
                distributed=True,
                output_dir=output_folder,
            )
        if evaluator_type == "cityscapes_sem_seg":
            assert (
                torch.cuda.device_count() >= comm.get_rank()
            ), "CityscapesEvaluator currently do not work with multiple machines."
            return CityscapesSemSegEvaluator(dataset_name)
        if len(evaluator_list) == 0:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        if len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

    @classmethod
    def build_train_loader(cls, cfg):
        if "SemanticSegmentor" in cfg.MODEL.META_ARCHITECTURE:
            mapper = DatasetMapper(cfg, is_train=True, augmentations=build_sem_seg_train_aug(cfg))
        else:
            mapper = None
        return build_detection_train_loader(cfg, mapper=mapper)

    @classmethod
    def build_lr_scheduler(cls, cfg, optimizer):
        """
        It now calls :func:`detectron2.solver.build_lr_scheduler`.
        Overwrite it if you'd like a different scheduler.
        """
        return build_lr_scheduler(cfg, optimizer)

def setup(args):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    add_deeplab_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    default_setup(cfg, args)
    return cfg

# I CHANGED THIS PART

def get_building_dicts(img_dir):
    """This function loads the JSON files created with the annotator and
    converts them to the detectron2 metadata specifications.
    """
    img_links = glob.glob(img_dir + "labels/*.json")

    # only keep the images that include "post"
    img_anns = list(filter(lambda x: "post" in x, img_links))

    dataset_dicts = []
    # loop through the entries in the JSON file
    for idx, single in enumerate(img_anns):
        v = json.load(open(single))
        record = {}
        # add file_name, image_id, height and width information to the records
        filename = os.path.join(img_dir, "images/", v["metadata"]["img_name"])
        height, width = (v["metadata"]["height"], v["metadata"]["width"])

        record["file_name"] = filename
        record["image_id"] = idx
        record["height"] = height
        record["width"] = width
        record["sem_seg_file_name"] = img_dir + "bin_masks/" + v["metadata"]["img_name"]
        dataset_dicts.append(record)

    return dataset_dicts

def main(args):
    # I CHANGED THIS PART, REGISTERING MY DATASETS HERE

    # set up
    img_anns = glob.glob('/n/tambe_lab/Users/michelewang/train/' + "labels/*.json")
    # the data has to be registered within detectron2, once for the train and
    # once for the val data
    for d in ["train", "test"]:
        DatasetCatalog.register(
            "xbddata_" + d,
            lambda d=d: get_building_dicts("/n/tambe_lab/Users/michelewang/" + d),
        )
        MetadataCatalog.get("xbddata_" + d).thing_classes = ["0", "1", "2"]

    buildingmetadata = MetadataCatalog.get("xbddata" + d)

    print("Dataset Catalog", DatasetCatalog.list())
    print("XBDDATA_TRAIN", DatasetCatalog.get("xbddata_train"))
    xbdtrain_metadata = MetadataCatalog.get("xbddata_train")

    cfg = setup(args)

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        return res

    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()

if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )

2. What exact command you run:
cd /n/home07/michelewang/thesis/detectron2/projects/DeepLab
python train_net_xbd.py --config-file configs/xBD-configs/base-deeplabv3.yaml --num-gpus 4

Relevant Configs: 

base-deeplabv3.yaml is:

_BASE_: base.yaml
MODEL:
  WEIGHTS: "detectron2://DeepLab/R-103.pkl"
  PIXEL_MEAN: [123.675, 116.280, 103.530]
  PIXEL_STD: [58.395, 57.120, 57.375]
  BACKBONE:
    NAME: "build_resnet_deeplab_backbone"
  RESNETS:
    DEPTH: 101
    NORM: "SyncBN"
    OUT_FEATURES: ["res2", "res5"]
    RES5_MULTI_GRID: [1, 2, 4]
    STEM_TYPE: "deeplab"
    STEM_OUT_CHANNELS: 128
    STRIDE_IN_1X1: False
  SEM_SEG_HEAD:
    NAME: "DeepLabV3PlusHead"
    IN_FEATURES: ["res2", "res5"]
    PROJECT_FEATURES: ["res2"]
    PROJECT_CHANNELS: [48]
    NORM: "SyncBN"
    COMMON_STRIDE: 4
INPUT:
  FORMAT: "RGB"


base.yaml is:

BASE: "../../../../configs/Base-RCNN-DilatedC5.yaml" MODEL: META_ARCHITECTURE: "SemanticSegmentor" BACKBONE: FREEZE_AT: 0 SEM_SEG_HEAD: NAME: "DeepLabV3Head" IN_FEATURES: ["res5"] ASPP_CHANNELS: 256 ASPP_DILATIONS: [6, 12, 18] ASPP_DROPOUT: 0.1 CONVS_DIM: 256 COMMON_STRIDE: 16 NUM_CLASSES: 19 LOSS_TYPE: "hard_pixel_mining" DATASETS: TRAIN: ("xbddata_train",) TEST: ("xbddata_test",) SOLVER: BASE_LR: 0.01 MAX_ITER: 90000 LR_SCHEDULER_NAME: "WarmupPolyLR" IMS_PER_BATCH: 16 INPUT: MIN_SIZE_TRAIN: (1024,) MIN_SIZE_TRAIN_SAMPLING: "choice" MIN_SIZE_TEST: 1024 MAX_SIZE_TRAIN: 1024 MAX_SIZE_TEST: 1024 CROP: ENABLED: True TYPE: "absolute" SIZE: (512, 1024) SINGLE_CATEGORY_MAX_AREA: 1.0 DATALOADER: NUM_WORKERS: 10

3. __Full logs__ or other relevant observations:

cuobjdump info : File '/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/_C.cpython-38-x86_64-linux-gnu.so' does not contain device code
cuobjdump info : File '/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/_C.cpython-38-x86_64-linux-gnu.so' does not contain device code
cuobjdump info : File '/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/_C.cpython-38-x86_64-linux-gnu.so' does not contain device code
cuobjdump info : File '/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/_C.cpython-38-x86_64-linux-gnu.so' does not contain device code
Traceback (most recent call last):
  File "train_net_xbd.py", line 194, in <module>
    launch(
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/engine/launch.py", line 55, in launch
    mp.spawn(
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 199, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes
    while not context.join():
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 118, in join
    raise Exception(msg)
Exception:

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap
    fn(i, *args)
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/engine/launch.py", line 94, in _distributed_worker
    main_func(*args)
  File "/n/home07/michelewang/thesis/detectron2/projects/DeepLab/train_net_xbd.py", line 186, in main
    trainer = Trainer(cfg)
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/engine/defaults.py", line 312, in __init__
    data_loader = self.build_train_loader(cfg)
  File "/n/home07/michelewang/thesis/detectron2/projects/DeepLab/train_net_xbd.py", line 109, in build_train_loader
    return build_detection_train_loader(cfg, mapper=mapper)
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/config/config.py", line 201, in wrapped
    explicit_args = _get_args_from_config(from_config, *args, **kwargs)
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/config/config.py", line 238, in _get_args_from_config
    ret = from_config_func(*args, **kwargs)
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/data/build.py", line 310, in _train_loader_from_config
    dataset = get_detection_dataset_dicts(
  File "/n/home07/michelewang/.conda/envs/active/lib/python3.8/site-packages/detectron2/data/build.py", line 231, in get_detection_dataset_dicts
    assert len(dicts), "Dataset '{}' is empty!".format(dataset_name)
AssertionError: Dataset 'xbddata_train' is empty!
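For context, the assertion at the bottom fires inside detectron2's get_detection_dataset_dicts whenever a registered loader returns an empty list, so the same error can be reproduced without any private data. A minimal sketch (the dataset name "empty_demo" is made up for illustration, and this assumes a detectron2 version that exposes get_detection_dataset_dicts from detectron2.data):

    from detectron2.data import DatasetCatalog, get_detection_dataset_dicts

    # Register a dataset whose loader returns no records at all.
    DatasetCatalog.register("empty_demo", lambda: [])

    # Raises: AssertionError: Dataset 'empty_demo' is empty!
    dicts = get_detection_dataset_dicts(["empty_demo"])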


4. Please simplify the steps as much as possible so they do not require additional resources to run, such as a private dataset.

## Expected behavior:
I would expect the model to train, but instead I get an error saying that both datasets I tried to register, xbddata_train and xbddata_test, are empty!

## Environment:

Provide your environment information using the following command:
github-actions[bot] commented 3 years ago

You've chosen to report an unexpected problem or bug. Unless you already know the root cause of it, please include details about it by filling the issue template. The following information is missing: "Your Environment";

ppwwyyxx commented 3 years ago

Does

print("XBDDATA_TRAIN", DatasetCatalog.get("xbddata_train"))

produce empty data? If so, that suggests the custom data-loading function in your code returns empty data.
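A quick way to check is to call the registered loader directly and inspect what it returns before training starts. A short sketch, using the dataset name from the script above:

    from detectron2.data import DatasetCatalog

    # DatasetCatalog.get() invokes the loader behind the registered name.
    dicts = DatasetCatalog.get("xbddata_train")
    print(len(dicts))   # 0 would confirm the loader itself returns nothing
    print(dicts[:1])    # spot-check the first record if there is one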

michelewang commented 3 years ago

Hi @ppwwyyxx you were completely right! That was the issue. Thank you so much, I really appreciate it!
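For readers who hit the same assertion: one likely culprit visible in the posted code is path concatenation. The registration passes "/n/tambe_lab/Users/michelewang/" + d with no trailing slash, while get_building_dicts builds its pattern as img_dir + "labels/*.json", so the glob becomes .../trainlabels/*.json and matches nothing. A hedged sketch of a defensive rewrite (the helper name get_label_files is hypothetical, not from the original script):

    import os
    import glob

    def get_label_files(img_dir):
        # os.path.join inserts the separator itself, so a missing trailing
        # slash in img_dir can no longer silently empty the glob results.
        pattern = os.path.join(img_dir, "labels", "*.json")
        files = glob.glob(pattern)
        # Fail loudly here instead of registering an empty dataset and
        # only finding out deep inside the data loader.
        assert files, "No label files matched {}".format(pattern)
        return files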