facebookresearch / detectron2

Detectron2 is a platform for object detection, segmentation and other visual recognition tasks.
https://detectron2.readthedocs.io/en/latest/
Apache License 2.0

PointRend training error with no object image #4383

Open jasonchenPJ opened 2 years ago

jasonchenPJ commented 2 years ago

I added images with no objects to the training set of a PointRend model, but I sometimes get a RuntimeError even with FILTER_EMPTY_ANNOTATIONS=False. I guess the error occurs when every image sampled into a batch contains no objects. To verify this, I built a dataset consisting only of no-object images and got the results below.

The following link is my dataset: https://drive.google.com/file/d/1sJlKhd6InYEozaN4F2mF7LxeRRPaSV8R/view?usp=sharing

Is there a solution to the problem?
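
For reference, a rough way to count how many images in the training JSON have no annotations (a small sketch using pycocotools; the path is the one from my setup below). With IMS_PER_BATCH = 16, a batch made up entirely of such images becomes likely once they are common:

from pycocotools.coco import COCO

# Count training images that have no annotations at all.
coco = COCO('./data/page_train/annotations.json')
img_ids = coco.getImgIds()
empty_ids = [i for i in img_ids if len(coco.getAnnIds(imgIds=i)) == 0]
print(f"{len(empty_ids)} / {len(img_ids)} images have no annotations")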

Instructions To Reproduce the Issue:

This trains Detectron2's PointRend model on a custom dataset with additional image augmentation.

1. Full runnable code or full changes you made:

import os
import cv2
import detectron2.data.transforms as T
import detectron2.utils.comm as comm
import pycocotools
import torch

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.data import DatasetMapper, build_detection_train_loader
from detectron2.data.datasets.coco import load_coco_json
from detectron2.engine import default_argument_parser, default_setup, launch
from detectron2.engine import DefaultTrainer, BestCheckpointer
from detectron2.evaluation import COCOEvaluator, DatasetEvaluators, verify_results
from detectron2.projects.point_rend import ColorAugSSDTransform, add_pointrend_config

# Segmentation model
config_file = './configs/pointrend_rcnn_R_50_FPN_3x_coco.yaml'
weight_file = './model_final_rcnn_R_50_FPN_3x.pkl'

# Dataset
cls_names = ['background', 'HPDF01', 'HAPP01']
train_path = './data/page_train'
valid_path = './data/page_valid'
train_json = os.path.join(train_path, 'annotations.json')
valid_json = os.path.join(valid_path, 'annotations.json')
num_train_data = 36

# Result folder
output_dir = './ckpt'


def setup(args):
    # load base config
    cfg = get_cfg()
    # extend config parameters for the PointRend NN
    add_pointrend_config(cfg)
    # update config settings for the PointRend NN
    cfg.merge_from_file(config_file)
    cfg.merge_from_list(['MODEL.WEIGHTS', weight_file])

    # update NN settings
    cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3
    cfg.MODEL.POINT_HEAD.NUM_CLASSES = 3

    # update data loader settings
    cfg.DATASETS.TRAIN = ("page_train",)
    cfg.DATASETS.TEST = ("page_valid",)
    cfg.DATALOADER.NUM_WORKERS = 4
    cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS = False

    cfg.INPUT.MAX_SIZE_TRAIN = 1024
    cfg.INPUT.MIN_SIZE_TRAIN = (640, 1024)
    cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING = "range"
    cfg.INPUT.MAX_SIZE_TEST = 1024
    cfg.INPUT.MIN_SIZE_TEST = 1024

    cfg.INPUT.RANDOM_FLIP = "none"
    cfg.INPUT.CROP.ENABLED = False
    cfg.INPUT.CROP.TYPE = "relative_range"
    cfg.INPUT.CROP.SIZE = [0.8, 0.8]
    cfg.INPUT.COLOR_AUG_SSD = True

    # update hyper-parameters
    cfg.SOLVER.IMS_PER_BATCH = 16
    ITERS_PER_EPOCH = int(num_train_data / cfg.SOLVER.IMS_PER_BATCH)
    MAX_ITER = ITERS_PER_EPOCH * 500
    cfg.SOLVER.MAX_ITER = MAX_ITER

    cfg.SOLVER.BASE_LR = 0.02
    cfg.SOLVER.MOMENTUM = 0.9
    cfg.SOLVER.STEPS = (int(MAX_ITER * 0.6), int(MAX_ITER * 0.8))
    cfg.SOLVER.GAMMA = 0.1
    cfg.SOLVER.WARMUP_ITERS = int(ITERS_PER_EPOCH * 0.05)

    # update evaluation/saving settings
    #cfg.SOLVER.CHECKPOINT_PERIOD = ITERS_PER_EPOCH
    cfg.TEST.EVAL_PERIOD = ITERS_PER_EPOCH
    cfg.OUTPUT_DIR = output_dir

    cfg.freeze()
    default_setup(cfg, args)
    return cfg


def plain_register_dataset():
    DatasetCatalog.register("page_train", lambda: load_coco_json(train_json, train_path))
    MetadataCatalog.get('page_train').set(thing_classes=cls_names, evaluator_type='coco',
                                          json_file=train_json, image_root=train_path)
    DatasetCatalog.register("page_valid", lambda: load_coco_json(valid_json, valid_path))
    MetadataCatalog.get('page_valid').set(thing_classes=cls_names, evaluator_type='coco',
                                          json_file=valid_json, image_root=valid_path)


def build_trainAug(cfg):
    augs = [T.RandomRotation(angle=[0, 90, 180, 270], sample_style="choice")]
    if cfg.INPUT.CROP.ENABLED:
        augs.append(
            T.RandomCrop(
                crop_type=cfg.INPUT.CROP.TYPE,
                crop_size=cfg.INPUT.CROP.SIZE
            )
        )
    augs.append(T.ResizeShortestEdge(short_edge_length=cfg.INPUT.MIN_SIZE_TRAIN,
                                     max_size=cfg.INPUT.MAX_SIZE_TRAIN,
                                     sample_style=cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING))
    if cfg.INPUT.COLOR_AUG_SSD:
        augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))

    return augs


class Trainer(DefaultTrainer):
    """
    We use the "DefaultTrainer" which contains a number of pre-defined components
    for the standard training workflow. They may not work for you, especially if
    you are working on a new research project. In that case you can use the
    cleaner "SimpleTrainer", or write your own training loop.
    """

    @classmethod
    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
        """
        Create evaluator(s) for a given dataset.
        This uses the special metadata "evaluator_type" associated with each builtin dataset.
        For your own dataset, you can simply create an evaluator manually in your
        script and do not have to worry about the hacky if-else logic here.
        """
        if output_folder is None:
            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
        evaluator_list = []
        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type

        if evaluator_type == "coco":
            evaluator_list.append(COCOEvaluator(dataset_name, output_dir=output_folder))

        if len(evaluator_list) == 0:
            raise NotImplementedError(
                "no Evaluator for the dataset {} with the type {}".format(
                    dataset_name, evaluator_type
                )
            )
        elif len(evaluator_list) == 1:
            return evaluator_list[0]
        return DatasetEvaluators(evaluator_list)

    @classmethod
    def build_train_loader(cls, cfg):
        mapper = DatasetMapper(cfg, is_train=True, augmentations=build_trainAug(cfg))

        train_dataLoader = build_detection_train_loader(cfg, mapper=mapper)
        #dump_augImage(cfg, train_dataLoader)
        return train_dataLoader

    def build_hooks(self):
        cfg = self.cfg.clone()
        hooks = super().build_hooks()
        if comm.is_main_process():
            hooks.append(
                BestCheckpointer(
                    eval_period=cfg.TEST.EVAL_PERIOD,
                    checkpointer=DetectionCheckpointer(self.model, cfg.OUTPUT_DIR),
                    val_metric="segm/AP",
                    mode="max"
                )
            )
        return hooks


def main(args):
    cfg = setup(args)
    plain_register_dataset()

    if args.eval_only:
        model = Trainer.build_model(cfg)
        DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load(
            cfg.MODEL.WEIGHTS, resume=args.resume
        )
        res = Trainer.test(cfg, model)
        if comm.is_main_process():
            verify_results(cfg, res)
        return res

    trainer = Trainer(cfg)
    trainer.resume_or_load(resume=args.resume)
    return trainer.train()


if __name__ == "__main__":
    args = default_argument_parser().parse_args()
    print("Command Line Args:", args)
    launch(
        main,
        args.num_gpus,
        num_machines=args.num_machines,
        machine_rank=args.machine_rank,
        dist_url=args.dist_url,
        args=(args,),
    )

2. What exact command you run:

CUDA_VISIBLE_DEVICES=0,1 python pointRend_train.py --num-gpus=2 --num-machines=1

3. __Full logs__ or other relevant observations:

[07/07 09:50:28] detectron2 INFO: Command line arguments: Namespace(config_file='', dist_url='tcp://127.0.0.1:51156', eval_only=False, machine_rank=0, num_gpus=2, num_machines=1, opts=[], resume=False) [07/07 09:50:28] detectron2 INFO: Running with full config: CUDNN_BENCHMARK: false DATALOADER: ASPECT_RATIO_GROUPING: true FILTER_EMPTY_ANNOTATIONS: false NUM_WORKERS: 4 REPEAT_THRESHOLD: 0.0 SAMPLER_TRAIN: TrainingSampler DATASETS: PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000 PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000 PROPOSAL_FILES_TEST: [] PROPOSAL_FILES_TRAIN: [] TEST:

[07/07 09:50:28] detectron2 INFO: Full config saved to ./ckpt5/config.yaml [07/07 09:50:29] d2.utils.env INFO: Using a generated random seed 30318684 [07/07 09:50:30] d2.engine.defaults INFO: Model: GeneralizedRCNN( (backbone): FPN( (fpn_lateral2): Conv2d(256, 256, kernel_size=(1, 1), stride=(1, 1)) (fpn_output2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (fpn_lateral3): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1)) (fpn_output3): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (fpn_lateral4): Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1)) (fpn_output4): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (fpn_lateral5): Conv2d(2048, 256, kernel_size=(1, 1), stride=(1, 1)) (fpn_output5): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) (top_block): LastLevelMaxPool() (bottom_up): ResNet( (stem): BasicStem( (conv1): Conv2d( 3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) ) (res2): Sequential( (0): BottleneckBlock( (shortcut): Conv2d( 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv1): Conv2d( 64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) (conv2): Conv2d( 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) (conv3): Conv2d( 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) ) (1): BottleneckBlock( (conv1): Conv2d( 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) (conv2): Conv2d( 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) (conv3): Conv2d( 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) ) (2): BottleneckBlock( (conv1): Conv2d( 256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) (conv2): Conv2d( 64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=64, eps=1e-05) ) (conv3): Conv2d( 64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) ) ) (res3): Sequential( (0): BottleneckBlock( (shortcut): Conv2d( 256, 512, kernel_size=(1, 1), stride=(2, 2), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv1): Conv2d( 256, 128, kernel_size=(1, 1), stride=(2, 2), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv2): Conv2d( 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv3): Conv2d( 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) ) (1): BottleneckBlock( (conv1): Conv2d( 512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv2): Conv2d( 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv3): Conv2d( 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) ) (2): BottleneckBlock( (conv1): Conv2d( 512, 128, 
kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv2): Conv2d( 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv3): Conv2d( 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) ) (3): BottleneckBlock( (conv1): Conv2d( 512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv2): Conv2d( 128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=128, eps=1e-05) ) (conv3): Conv2d( 128, 512, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) ) ) (res4): Sequential( (0): BottleneckBlock( (shortcut): Conv2d( 512, 1024, kernel_size=(1, 1), stride=(2, 2), bias=False (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05) ) (conv1): Conv2d( 512, 256, kernel_size=(1, 1), stride=(2, 2), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv2): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv3): Conv2d( 256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05) ) ) (1): BottleneckBlock( (conv1): Conv2d( 1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv2): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv3): Conv2d( 256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05) ) ) (2): BottleneckBlock( (conv1): Conv2d( 1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv2): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv3): Conv2d( 256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05) ) ) (3): BottleneckBlock( (conv1): Conv2d( 1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv2): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv3): Conv2d( 256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05) ) ) (4): BottleneckBlock( (conv1): Conv2d( 1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv2): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv3): Conv2d( 256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=1024, eps=1e-05) ) ) (5): BottleneckBlock( (conv1): Conv2d( 1024, 256, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv2): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=256, eps=1e-05) ) (conv3): Conv2d( 256, 1024, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=1024, 
eps=1e-05) ) ) ) (res5): Sequential( (0): BottleneckBlock( (shortcut): Conv2d( 1024, 2048, kernel_size=(1, 1), stride=(2, 2), bias=False (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05) ) (conv1): Conv2d( 1024, 512, kernel_size=(1, 1), stride=(2, 2), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv2): Conv2d( 512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv3): Conv2d( 512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05) ) ) (1): BottleneckBlock( (conv1): Conv2d( 2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv2): Conv2d( 512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv3): Conv2d( 512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05) ) ) (2): BottleneckBlock( (conv1): Conv2d( 2048, 512, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv2): Conv2d( 512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=512, eps=1e-05) ) (conv3): Conv2d( 512, 2048, kernel_size=(1, 1), stride=(1, 1), bias=False (norm): FrozenBatchNorm2d(num_features=2048, eps=1e-05) ) ) ) ) ) (proposal_generator): RPN( (rpn_head): StandardRPNHead( (conv): Conv2d( 256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1) (activation): ReLU() ) (objectness_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1)) (anchor_deltas): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1)) ) (anchor_generator): DefaultAnchorGenerator( (cell_anchors): BufferList() ) ) (roi_heads): StandardROIHeads( (box_pooler): ROIPooler( (level_poolers): ModuleList( (0): ROIAlign(output_size=(7, 7), spatial_scale=0.25, sampling_ratio=0, aligned=True) (1): ROIAlign(output_size=(7, 7), spatial_scale=0.125, sampling_ratio=0, aligned=True) (2): ROIAlign(output_size=(7, 7), spatial_scale=0.0625, sampling_ratio=0, aligned=True) (3): ROIAlign(output_size=(7, 7), spatial_scale=0.03125, sampling_ratio=0, aligned=True) ) ) (box_head): FastRCNNConvFCHead( (flatten): Flatten(start_dim=1, end_dim=-1) (fc1): Linear(in_features=12544, out_features=1024, bias=True) (fc_relu1): ReLU() (fc2): Linear(in_features=1024, out_features=1024, bias=True) (fc_relu2): ReLU() ) (box_predictor): FastRCNNOutputLayers( (cls_score): Linear(in_features=1024, out_features=4, bias=True) (bbox_pred): Linear(in_features=1024, out_features=12, bias=True) ) (mask_head): PointRendMaskHead( (point_head): StandardPointHead( (fc1): Conv1d(259, 256, kernel_size=(1,), stride=(1,)) (fc2): Conv1d(259, 256, kernel_size=(1,), stride=(1,)) (fc3): Conv1d(259, 256, kernel_size=(1,), stride=(1,)) (predictor): Conv1d(259, 3, kernel_size=(1,), stride=(1,)) ) (coarse_head): ConvFCHead( (reduce_spatial_dim_conv): Conv2d(256, 256, kernel_size=(2, 2), stride=(2, 2)) (fc1): Linear(in_features=12544, out_features=1024, bias=True) (fc2): Linear(in_features=1024, out_features=1024, bias=True) (prediction): Linear(in_features=1024, out_features=147, bias=True) ) ) ) ) [07/07 09:50:30] d2.data.dataset_mapper INFO: [DatasetMapper] Augmentations used in training: [RandomRotation(angle=[0, 90, 180, 270], sample_style='choice'), ResizeShortestEdge(short_edge_length=(640, 1024), max_size=1024), 
<detectron2.projects.point_rend.color_augmentation.ColorAugSSDTransform object at 0x7f1083128410>] [07/07 09:50:30] d2.data.datasets.coco INFO: Loaded 36 images in COCO format from /home/data/ctbc/nhi_segment3/page_train/annotations.json [07/07 09:50:30] d2.data.build INFO: Distribution of instances among all 3 categories:  category #instances category #instances category #instances
background  0    HPDF01  0    HAPP01  0
total       0

[07/07 09:50:30] d2.data.build INFO: Using training sampler TrainingSampler [07/07 09:50:30] d2.data.common INFO: Serializing 36 elements to byte tensors and concatenating them all ... [07/07 09:50:30] d2.data.common INFO: Serialized dataset takes 0.01 MiB [07/07 09:50:30] fvcore.common.checkpoint INFO: [Checkpointer] Loading from /autohome/user/jason/project/aione_dev/seg_preTrain/model_final_rcnn_R_50_FPN_3x.pkl ... [07/07 09:50:31] fvcore.common.checkpoint INFO: Reading a file from 'Detectron2 Model Zoo' [07/07 09:50:31] d2.projects.point_rend.mask_head WARNING: Weight format of PointRend models have changed! Applying automatic conversion now ... [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.box_predictor.cls_score.weight' to the model due to incompatible shapes: (81, 1024) in the checkpoint but (4, 1024) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.box_predictor.cls_score.bias' to the model due to incompatible shapes: (81,) in the checkpoint but (4,) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.box_predictor.bbox_pred.weight' to the model due to incompatible shapes: (320, 1024) in the checkpoint but (12, 1024) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.box_predictor.bbox_pred.bias' to the model due to incompatible shapes: (320,) in the checkpoint but (12,) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.coarse_head.prediction.weight' to the model due to incompatible shapes: (3920, 1024) in the checkpoint but (147, 1024) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.coarse_head.prediction.bias' to the model due to incompatible shapes: (3920,) in the checkpoint but (147,) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.point_head.fc1.weight' to the model due to incompatible shapes: (256, 336, 1) in the checkpoint but (256, 259, 1) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.point_head.fc2.weight' to the model due to incompatible shapes: (256, 336, 1) in the checkpoint but (256, 259, 1) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.point_head.fc3.weight' to the model due to incompatible shapes: (256, 336, 1) in the checkpoint but (256, 259, 1) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.point_head.predictor.weight' to the model due to incompatible shapes: (80, 336, 1) in the checkpoint but (3, 259, 1) in the model! You might want to double check if this is expected. 
[07/07 09:50:31] fvcore.common.checkpoint WARNING: Skip loading parameter 'roi_heads.mask_head.point_head.predictor.bias' to the model due to incompatible shapes: (80,) in the checkpoint but (3,) in the model! You might want to double check if this is expected. [07/07 09:50:31] fvcore.common.checkpoint WARNING: Some model parameters or buffers are not found in the checkpoint: roi_heads.box_predictor.bbox_pred.{bias, weight} roi_heads.box_predictor.cls_score.{bias, weight} roi_heads.mask_head.coarse_head.prediction.{bias, weight} roi_heads.mask_head.point_head.fc1.weight roi_heads.mask_head.point_head.fc2.weight roi_heads.mask_head.point_head.fc3.weight roi_heads.mask_head.point_head.predictor.{bias, weight} [07/07 09:50:31] d2.engine.train_loop INFO: Starting training from iteration 0 [07/07 09:50:35] d2.engine.train_loop ERROR: Exception during training: Traceback (most recent call last): File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/engine/train_loop.py", line 149, in train self.run_step() File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/engine/defaults.py", line 494, in run_step self._trainer.run_step() File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/engine/train_loop.py", line 273, in run_step loss_dict = self.model(data) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl return forward_call(*input, kwargs) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 886, in forward output = self.module(*inputs[0], *kwargs[0]) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl return forward_call(input, kwargs) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/modeling/metaarch/rcnn.py", line 163, in forward , detector_losses = self.roi_heads(images, features, proposals, gt_instances) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl return forward_call(*input, *kwargs) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 743, in forward losses.update(self._forward_mask(features, proposals)) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/modeling/roi_heads/roi_heads.py", line 846, in _forward_mask return self.mask_head(features, instances) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl return forward_call(input, **kwargs) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/projects/point_rend/mask_head.py", line 233, in forward point_coords, point_labels = self._sample_train_points(coarse_mask, instances) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/projects/point_rend/mask_head.py", line 285, in _sample_train_points point_labels = sample_point_labels(instances, point_coords_wrt_image) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/projects/point_rend/point_features.py", line 258, in sample_point_labels point_labels = cat(gt_mask_logits) File "/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2/layers/wrappers.py", line 45, 
in cat return torch.cat(tensors, dim) NotImplementedError: There were no tensor arguments to this function (e.g., you passed an empty list of Tensors), but no fallback function is registered for schema aten::_cat. This usually means that this function requires a non-empty list of Tensors, or that you (the operator writer) forgot to register a fallback function. Available functions are [CPU, CUDA, QuantizedCPU, BackendSelect, Python, Named, Conjugate, Negative, ADInplaceOrView, AutogradOther, AutogradCPU, AutogradCUDA, AutogradXLA, AutogradLazy, AutogradXPU, AutogradMLC, AutogradHPU, AutogradNestedTensor, AutogradPrivateUse1, AutogradPrivateUse2, AutogradPrivateUse3, Tracer, UNKNOWN_TENSOR_TYPE_ID, Autocast, Batched, VmapMode].

CPU: registered at aten/src/ATen/RegisterCPU.cpp:18433 [kernel] CUDA: registered at aten/src/ATen/RegisterCUDA.cpp:26496 [kernel] QuantizedCPU: registered at aten/src/ATen/RegisterQuantizedCPU.cpp:1068 [kernel] BackendSelect: fallthrough registered at ../aten/src/ATen/core/BackendSelectFallbackKernel.cpp:3 [backend fallback] Python: registered at ../aten/src/ATen/core/PythonFallbackKernel.cpp:47 [backend fallback] Named: registered at ../aten/src/ATen/core/NamedRegistrations.cpp:7 [backend fallback] Conjugate: registered at ../aten/src/ATen/ConjugateFallback.cpp:18 [backend fallback] Negative: registered at ../aten/src/ATen/native/NegateFallback.cpp:18 [backend fallback] ADInplaceOrView: fallthrough registered at ../aten/src/ATen/core/VariableFallbackKernel.cpp:64 [backend fallback] AutogradOther: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradCPU: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradCUDA: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradXLA: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradLazy: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradXPU: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradMLC: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradHPU: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradNestedTensor: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradPrivateUse1: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradPrivateUse2: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] AutogradPrivateUse3: registered at ../torch/csrc/autograd/generated/VariableType_3.cpp:10141 [autograd kernel] Tracer: registered at ../torch/csrc/autograd/generated/TraceType_3.cpp:11560 [kernel] UNKNOWN_TENSOR_TYPE_ID: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:466 [backend fallback] Autocast: fallthrough registered at ../aten/src/ATen/autocast_mode.cpp:305 [backend fallback] Batched: registered at ../aten/src/ATen/BatchingRegistrations.cpp:1016 [backend fallback] VmapMode: fallthrough registered at ../aten/src/ATen/VmapModeRegistrations.cpp:33 [backend fallback]

[07/07 09:50:35] d2.engine.hooks INFO: Total training time: 0:00:03 (0:00:00 on hooks) [07/07 09:50:35] d2.utils.events INFO: iter: 0 lr: N/A max_mem: 6422M


## Expected behavior:
No RuntimeError should be raised.

## Environment:

Paste the output of the following command:

sys.platform              linux
Python                    3.7.13 (default, Mar 29 2022, 02:18:16) [GCC 7.5.0]
numpy                     1.21.6
detectron2                0.6 @/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/detectron2
Compiler                  GCC 7.3
CUDA compiler             CUDA 10.2
detectron2 arch flags     3.7, 5.0, 5.2, 6.0, 6.1, 7.0, 7.5
DETECTRON2_ENV_MODULE
PyTorch                   1.10.1+cu102 @/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torch
PyTorch debug build       False
GPU available             Yes
GPU 0,1                   Tesla P100-PCIE-16GB (arch=6.0)
Driver version            470.103.01
CUDA_HOME                 /usr/local/cuda
Pillow                    9.1.1
torchvision               0.11.2+cu102 @/autohome/user/jason/miniconda3/envs/NHI/lib/python3.7/site-packages/torchvision
torchvision arch flags    3.5, 5.0, 6.0, 7.0, 7.5
fvcore                    0.1.5.post20220512
iopath                    0.1.9
cv2                       4.4.0


PyTorch built with:

jasonchenPJ commented 2 years ago

Hi @ppwwyyxx sorry for bothering you, do you have any suggestions for me about this issue?

shining-love commented 2 years ago

> Hi @ppwwyyxx sorry for bothering you, do you have any suggestions for me about this issue?

Hello, have you solved this problem?

jasonchenPJ commented 2 years ago

Not yet... I'm still confused.

carlos-havier commented 1 year ago

Hi guys. I came across the same problem while training PointRend with RandomCrop as augmentation. I think the issue arises, as in your case, when an image (or, in my case, a crop) has no positive labels in its masks.

I patched point_features.py in anaconda3\envs\hands\Lib\site-packages\detectron2-0.4.1-py3.9-win-amd64.egg\detectron2\projects\point_rend (or wherever your environment is) by changing:

point_labels = cat(gt_mask_logits)

for:

    sR, sP, s2 = point_coords.shape
    assert s2 == 2, point_coords.shape
    if gt_mask_logits:
        point_labels = cat(gt_mask_logits)
    else:
        point_labels = torch.zeros((sR, sP), dtype=point_coords.dtype, layout=point_coords.layout, device=point_coords.device)

Seems to be working fine with my data.

If it also works for you, I suggest someone patches it in Detectron2 and does a pull request.
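
If you prefer not to edit the installed package, the same guard can probably be applied from the training script by wrapping the function before building the trainer. A rough, untested sketch, assuming a pip-installed detectron2 where the PointRend mask head calls sample_point_labels by name:

from detectron2.projects.point_rend import mask_head, point_features

_orig_sample_point_labels = point_features.sample_point_labels

def _safe_sample_point_labels(instances, point_coords):
    # Every image in the batch has zero GT instances: return all-zero labels with
    # the expected (R, P) shape instead of letting cat() fail on an empty list.
    if all(len(x) == 0 for x in instances):
        return point_coords.new_zeros(point_coords.shape[:2])
    return _orig_sample_point_labels(instances, point_coords)

# mask_head imports the function by name, so patch its reference as well.
mask_head.sample_point_labels = _safe_sample_point_labels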

Cheers

jasonchenPJ commented 1 year ago

Hi @carlos-havier, thanks for your reply. Based on your suggestion, I retried the "full no-object dataset" case. It works now.

Thank you so much~

emmanuel-nwogu commented 1 year ago

I am also having a similar error when trying to train using PointRend:

ERROR [10/19 22:25:15 d2.engine.train_loop]: Exception during training:
Traceback (most recent call last):
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/engine/train_loop.py", line 149, in train
    self.run_step()
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/engine/defaults.py", line 494, in run_step
    self._trainer.run_step()
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/engine/train_loop.py", line 274, in run_step
    loss_dict = self.model(data)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/modeling/meta_arch/rcnn.py", line 167, in forward
    _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/modeling/roi_heads/roi_heads.py", line 743, in forward
    losses.update(self._forward_mask(features, proposals))
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/modeling/roi_heads/roi_heads.py", line 846, in _forward_mask
    return self.mask_head(features, instances)
  File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/projects/PointRend/point_rend/mask_head.py", line 233, in forward
    point_coords, point_labels = self._sample_train_points(coarse_mask, instances)
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/projects/PointRend/point_rend/mask_head.py", line 285, in _sample_train_points
    point_labels = sample_point_labels(instances, point_coords_wrt_image)
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/projects/PointRend/point_rend/point_features.py", line 258, in sample_point_labels
    point_labels = cat(gt_mask_logits)
  File "/content/gdrive/MyDrive/4mlab/detectron2_repo/detectron2/layers/wrappers.py", line 46, in cat
    return torch.cat(tensors, dim)
RuntimeError: torch.cat(): expected a non-empty list of Tensors

Training code:

import os

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer
# `point_rend` below is assumed to be importable already (the point_rend package
# under projects/PointRend in the detectron2 repo).

cfg = get_cfg()
# Add PointRend-specific config
point_rend.add_pointrend_config(cfg)
cfg.merge_from_file("/content/gdrive/MyDrive/4mlab/detectron2_repo/projects/PointRend/configs/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco.yaml")
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
cfg.MODEL.WEIGHTS = "detectron2://PointRend/InstanceSegmentation/pointrend_rcnn_R_50_FPN_3x_coco/164955410/model_final_edd263.pkl"
cfg.DATASETS.TRAIN = ("femur_train",)
cfg.DATASETS.TEST = ()  # "femur_val"
cfg.DATALOADER.NUM_WORKERS = 1
cfg.SOLVER.IMS_PER_BATCH = 1  # This is the real "batch size" commonly known to deep learning people
cfg.SOLVER.BASE_LR = 0.00025  # pick a good LR
cfg.SOLVER.MAX_ITER = 100    # 300 iterations seems good enough for this toy dataset; you will need to train longer for a practical dataset
cfg.SOLVER.STEPS = []        # do not decay learning rate
cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE = 128   # The "RoIHead batch size". 128 is faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1  # only has one class (femur). (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
# NOTE: this config means the number of classes, but a few popular unofficial tutorials incorrectly use num_classes+1 here.
cfg.MODEL.POINT_HEAD.NUM_CLASSES = cfg.MODEL.ROI_HEADS.NUM_CLASSES
cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES = cfg.MODEL.ROI_HEADS.NUM_CLASSES
cfg.TEST.DETECTIONS_PER_IMAGE = 5
cfg.OUTPUT_DIR = r"/content/gdrive/MyDrive/4mlab/checkpoint"

os.makedirs(cfg.OUTPUT_DIR, exist_ok=True)
trainer = DefaultTrainer(cfg) 
trainer.resume_or_load(resume=False)
trainer.train()
checkpointer = DetectionCheckpointer(trainer.model, save_dir=cfg.OUTPUT_DIR)
checkpointer.save("femur_detect_model")

Funnily enough, when I ran this same code months ago (Sept. 29, 2022), it all worked perfectly. I suspect a recent source code change may be the cause.

emmanuel-nwogu commented 1 year ago

I just figured out a surprising fix inspired by this issue comment. I changed cfg.MODEL.ROI_HEADS.NUM_CLASSES = 1 to cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2 despite actually having one class. It works now :)
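
Concretely, the only change to the training code above is (the reason is still unclear to me):

cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2  # the dataset really has only 1 class (femur)
# POINT_HEAD and SEM_SEG_HEAD follow automatically, since they are set from ROI_HEADS above.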

emmanuel-nwogu commented 1 year ago

Sorry if I derailed this issue; maybe I should have started a new one. Also, the fix I mentioned above does not work in all cases: when I use images with no masks, like the author, the issue persists. I haven't tried the fix by @carlos-havier yet. I'm hoping it's a good fix. Thanks, Carlos :)

emmanuel-nwogu commented 1 year ago

Just tried it. It does work great! Thanks again :) @carlos-havier

aymanaboghonim commented 1 year ago

Thanks @carlos-havier, you saved my day.