Closed: isagastiberri closed this issue 5 years ago.
`data_dict['sem_seg']` should be a `torch.Tensor`, not a numpy array. I'll fix the docs.
Loading every `sem_seg` into memory up front is a bad idea memory-wise. You can also use `sem_seg_file_name`.
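For reference, a minimal sketch of both options, with hypothetical paths and sizes:

```python
import numpy as np
import torch

# Option A (hypothetical paths): point the dataset dict at the label image with
# "sem_seg_file_name" and let the default DatasetMapper read it from disk lazily.
record_lazy = {
    "file_name": "images/0001.jpg",          # RGB image
    "image_id": 1,
    "height": 1024,
    "width": 2048,
    "sem_seg_file_name": "labels/0001.png",  # per-pixel class ids
}

# Option B: if you put "sem_seg" into the dict yourself, it must be a
# torch.Tensor of integer class ids, not a numpy array.
label_map = np.zeros((1024, 2048), dtype=np.uint8)  # placeholder label image
record_eager = {
    "file_name": "images/0001.jpg",
    "image_id": 1,
    "height": 1024,
    "width": 2048,
    "sem_seg": torch.as_tensor(label_map, dtype=torch.long),
}
```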
Thanks! I used sem_seg_file_name to load the semantic segmentation and it works now. I wasn't sure whether the semantic segmentation could be loaded from images or had to come from a JSON, which is why I loaded sem_seg directly, but I see it works with images. Thank you for your help!
Hello @isagastiberri, did you run into the following problem when training on Mapillary? I used the code you listed above.
Config './configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml' has no VERSION. Assuming it to be compatible with latest v2.
after trainer
'roi_heads.box_predictor.cls_score.weight' has shape (81, 1024) in the checkpoint but (68, 1024) in the model! Skipped.
'roi_heads.box_predictor.cls_score.bias' has shape (81,) in the checkpoint but (68,) in the model! Skipped.
'roi_heads.box_predictor.bbox_pred.weight' has shape (320, 1024) in the checkpoint but (268, 1024) in the model! Skipped.
'roi_heads.box_predictor.bbox_pred.bias' has shape (320,) in the checkpoint but (268,) in the model! Skipped.
'roi_heads.mask_head.predictor.weight' has shape (80, 256, 1, 1) in the checkpoint but (67, 256, 1, 1) in the model! Skipped.
'roi_heads.mask_head.predictor.bias' has shape (80,) in the checkpoint but (67,) in the model! Skipped.
after load
/opt/conda/conda-bld/pytorch_1570910687230/work/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [2,0,0], thread: [156,0,0] Assertion `t >= 0 && t < n_classes` failed.
/opt/conda/conda-bld/pytorch_1570910687230/work/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [2,0,0], thread: [157,0,0] Assertion `t >= 0 && t < n_classes` failed.
/opt/conda/conda-bld/pytorch_1570910687230/work/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [2,0,0], thread: [158,0,0] Assertion `t >= 0 && t < n_classes` failed.
/opt/conda/conda-bld/pytorch_1570910687230/work/aten/src/THCUNN/SpatialClassNLLCriterion.cu:104: void cunn_SpatialClassNLLCriterion_updateOutput_kernel(T *, T *, T *, long *, T *, int, int, int, int, int, long) [with T = float, AccumT = float]: block: [2,0,0], thread: [159,0,0] Assertion `t >= 0 && t < n_classes` failed.
Traceback (most recent call last):
File "/home/lin/PycharmProjects/detectron2/tools/train_mapillary_panoptic.py", line 49, in <module>
trainer.train()
File "/home/lin/PycharmProjects/detectron2/detectron2/engine/defaults.py", line 329, in train
super().train(self.start_iter, self.max_iter)
File "/home/lin/PycharmProjects/detectron2/detectron2/engine/train_loop.py", line 132, in train
self.run_step()
File "/home/lin/PycharmProjects/detectron2/detectron2/engine/train_loop.py", line 212, in run_step
loss_dict = self.model(data)
File "/home/lin/Software/anaconda3/envs/psnet/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "/home/lin/PycharmProjects/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py", line 96, in forward
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
File "/home/lin/Software/anaconda3/envs/psnet/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "/home/lin/PycharmProjects/detectron2/detectron2/modeling/proposal_generator/rpn.py", line 143, in forward
anchors = self.anchor_generator(features)
File "/home/lin/Software/anaconda3/envs/psnet/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in __call__
result = self.forward(*input, **kwargs)
File "/home/lin/PycharmProjects/detectron2/detectron2/modeling/anchor_generator.py", line 181, in forward
anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)
File "/home/lin/PycharmProjects/detectron2/detectron2/modeling/anchor_generator.py", line 124, in grid_anchors
shift_x, shift_y = _create_grid_offsets(size, stride, base_anchors.device)
File "/home/lin/PycharmProjects/detectron2/detectron2/modeling/anchor_generator.py", line 43, in _create_grid_offsets
shifts_x = torch.arange(0, grid_width * stride, step=stride, dtype=torch.float32, device=device)
RuntimeError: tabulate: failed to synchronize: cudaErrorAssert: device-side assert triggered
@EEEGUI yes! I have the same problem. I was checking everything in my own code first, but I was about to open a new issue because I don't think that assertion should break the code.
The assertion `t >= 0 && t < n_classes` is not correct, as t = n_classes should be the background class.
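For reference, a minimal sketch of how out-of-range ids can be remapped before training, assuming such pixels should simply be ignored rather than treated as a dedicated background index (the class count and ignore value below are assumptions; detectron2's `MODEL.SEM_SEG_HEAD.IGNORE_VALUE` defaults to 255):

```python
import numpy as np

NUM_SEM_SEG_CLASSES = 66   # assumed class count for this Mapillary setup
IGNORE_VALUE = 255         # detectron2's default MODEL.SEM_SEG_HEAD.IGNORE_VALUE

def sanitize_sem_seg(label_map: np.ndarray) -> np.ndarray:
    """Remap any pixel id outside [0, NUM_SEM_SEG_CLASSES) to the ignore value.

    The device-side assert `t >= 0 && t < n_classes` comes from the NLL loss
    kernel and fires when a ground-truth pixel id reaches n_classes (unless it
    equals the ignore index), so such ids must be remapped before the loss.
    """
    label_map = label_map.copy()
    out_of_range = (label_map < 0) | (label_map >= NUM_SEM_SEG_CLASSES)
    label_map[out_of_range] = IGNORE_VALUE
    return label_map
```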
How To Reproduce the Issue
What changes you made (`git diff`) or what code you wrote:
I wrote my own code for registering the dataset, following the balloon example and the docs. Here is how I register it:

```python
def register_mapillary(root_dir):
    config_path = os.path.join(root_dir, 'config.json')
    data_name = 'mapillary-panoptic-'
    # read in config file
```
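The rest of `register_mapillary` is cut off above. For context, a rough sketch of what such a registration function could look like, assuming the standard Mapillary Vistas `config.json` layout and a hypothetical `get_mapillary_dicts` loader:

```python
import json
import os

from detectron2.data import DatasetCatalog, MetadataCatalog


def get_mapillary_dicts(root_dir, split):
    """Hypothetical loader: returns a list of detectron2 dataset dicts for `split`."""
    raise NotImplementedError  # dataset-specific parsing goes here


def register_mapillary(root_dir):
    # read in config file: Mapillary Vistas ships its class definitions in config.json
    config_path = os.path.join(root_dir, 'config.json')
    with open(config_path) as f:
        labels = json.load(f)['labels']
    class_names = [label['readable'] for label in labels]

    data_name = 'mapillary-panoptic-'
    for split in ('training', 'validation'):
        name = data_name + split
        DatasetCatalog.register(name, lambda s=split: get_mapillary_dicts(root_dir, s))
        MetadataCatalog.get(name).set(stuff_classes=class_names)
```

With the datasets registered this way, `cfg.DATASETS.TRAIN = ("mapillary-panoptic-training",)` in the script below resolves to the registered loader.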
```python
root_dir = './datasets/mapillary-vistas-panoptic/'
register_mapillary(root_dir)

cfg = get_cfg()
cfg.merge_from_file("./configs/COCO-PanopticSegmentation/panoptic_fpn_R_50_1x.yaml")
cfg.DATASETS.TRAIN = ("mapillary-panoptic-training",)
cfg.DATASETS.TEST = ()  # no metrics implemented for this dataset
cfg.DATALOADER.NUM_WORKERS = 1
cfg.MODEL.WEIGHTS = "models/model_final_panoptic.pkl"  # initialize from model zoo
cfg.SOLVER.IMS_PER_BATCH = 1
cfg.SOLVER.BASE_LR = 0.00025
cfg.SOLVER.MAX_ITER = 300  # 300 iterations seems good enough, but you can certainly train longer
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128  # faster, and good enough for this toy dataset
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 66  # 66 classes for Mapillary Vistas (the balloon example used 1)

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg)
print('after trainer')
trainer.resume_or_load(resume=False)
print('after load')
trainer.train()
print('training finished')
```
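One thing worth noting about the snippet above: it only sets `ROI_HEADS.NUM_CLASSES`. The semantic-segmentation branch of Panoptic FPN reads its own class count and ignore label from the config, so those may need to be set as well. A sketch of the extra lines, with assumed values, continuing the `cfg` from the script above:

```python
# Continuing the cfg from the script above (values are assumptions, not the
# thread author's settings): the semantic-segmentation head of Panoptic FPN
# has its own class count and ignore label, independent of ROI_HEADS.NUM_CLASSES.
cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES = 66    # set to the number of classes in your sem_seg labels
cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE = 255  # pixels with this id are excluded from the sem_seg loss
```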
Traceback (most recent call last):
File "tools/train_panoptic.py", line 35, in <module>
trainer.train()
File "/workspace/detectron2/detectron2/engine/defaults.py", line 350, in train
super().train(self.start_iter, self.max_iter)
File "/workspace/detectron2/detectron2/engine/train_loop.py", line 132, in train
self.run_step()
File "/workspace/detectron2/detectron2/engine/train_loop.py", line 212, in run_step
loss_dict = self.model(data)
File "/opt/conda/lib/python3.6/site-packages/torch/nn/modules/module.py", line 541, in call
result = self.forward(*input, **kwargs)
File "/workspace/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py", line 83, in forward
gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
File "/workspace/detectron2/detectron2/modeling/meta_arch/panoptic_fpn.py", line 83, in
gt_sem_seg = [x["sem_seg"].to(self.device) for x in batched_inputs]
AttributeError: 'numpy.ndarray' object has no attribute 'to'
PyTorch built with: