aim-uofa / AdelaiDet

AdelaiDet is an open source toolbox for multiple instance-level detection and recognition tasks.
https://git.io/AdelaiDet
Other
3.37k stars 646 forks source link

Difference between CondInst and BlendMask on Segmentation Loss #286

Closed lxtGH closed 3 years ago

lxtGH commented 3 years ago

Hi! Thanks for opensourcing the code.

Both work use the segmentation loss for extra supervision to the mask feature(blender). What is difference of these two?

From the code, I found that Blend mask considers the background. (80 + 1)

what about adopting the same pipline as CondInst (80 classes)?

pzh11001 commented 3 years ago

I changed all the “NUM_CLASSES” in yaml file to my own number. It works well

an99990 commented 2 years ago

Hi, i have changed all the NUM_CLASSES in my yaml as well but i somehow predict outside my classes ?

here is my cfg please let me know how i can fix this issue

INFO:root:CUDNN_BENCHMARK: False
DATALOADER:
  ASPECT_RATIO_GROUPING: True
  FILTER_EMPTY_ANNOTATIONS: True
  NUM_WORKERS: 4
  REPEAT_THRESHOLD: 0.0
  SAMPLER_TRAIN: TrainingSampler
DATASETS:
  PRECOMPUTED_PROPOSAL_TOPK_TEST: 1000
  PRECOMPUTED_PROPOSAL_TOPK_TRAIN: 2000
  PROPOSAL_FILES_TEST: ()
  PROPOSAL_FILES_TRAIN: ()
  TEST: ('coco_2017_val',)
  TRAIN: ('coco_2017_train',)
GLOBAL:
  HACK: 1.0
INPUT:
  CROP:
    CROP_INSTANCE: True
    ENABLED: False
    SINGLE_CATEGORY_MAX_AREA: 1.0
    SIZE: [0.9, 0.9]
    TYPE: relative_range
  FORMAT: BGR
  HFLIP_TRAIN: True
  MASK_FORMAT: polygon
  MAX_SIZE_TEST: 736
  MAX_SIZE_TRAIN: 900
  MIN_SIZE_TEST: 512
  MIN_SIZE_TRAIN: (256, 288, 320, 352, 384, 416, 448, 480, 512, 544, 576, 608)
  MIN_SIZE_TRAIN_SAMPLING: choice
  RANDOM_FLIP: horizontal
MODEL:
  ANCHOR_GENERATOR:
    ANGLES: [[-90, 0, 90]]
    ASPECT_RATIOS: [[0.5, 1.0, 2.0]]
    NAME: DefaultAnchorGenerator
    OFFSET: 0.0
    SIZES: [[32, 64, 128, 256, 512]]
  BACKBONE:
    ANTI_ALIAS: False
    FREEZE_AT: -1
    NAME: build_fcos_dla_fpn_backbone
  BASIS_MODULE:
    ANN_SET: coco
    COMMON_STRIDE: 8
    CONVS_DIM: 128
    IN_FEATURES: ['p3', 'p4', 'p5']
    LOSS_ON: True
    LOSS_WEIGHT: 0.3
    NAME: ProtoNet
    NORM: SyncBN
    NUM_BASES: 4
    NUM_CLASSES: 1
    NUM_CONVS: 3
  BATEXT:
    CANONICAL_SIZE: 96
    CONV_DIM: 256
    CUSTOM_DICT: 
    IN_FEATURES: ['p2', 'p3', 'p4']
    NUM_CHARS: 25
    NUM_CONV: 2
    POOLER_RESOLUTION: (8, 32)
    POOLER_SCALES: (0.25, 0.125, 0.0625)
    RECOGNITION_LOSS: ctc
    RECOGNIZER: attn
    SAMPLING_RATIO: 1
    USE_AET: False
    USE_COORDCONV: False
    VOC_SIZE: 96
  BLENDMASK:
    ATTN_SIZE: 14
    BOTTOM_RESOLUTION: 56
    INSTANCE_LOSS_WEIGHT: 1.0
    POOLER_SAMPLING_RATIO: 1
    POOLER_SCALES: (0.25,)
    POOLER_TYPE: ROIAlignV2
    TOP_INTERP: bilinear
    VISUALIZE: False
  BOXINST:
    BOTTOM_PIXELS_REMOVED: 10
    ENABLED: False
    PAIRWISE:
      COLOR_THRESH: 0.3
      DILATION: 2
      SIZE: 3
      WARMUP_ITERS: 10000
  BiFPN:
    IN_FEATURES: ['res2', 'res3', 'res4', 'res5']
    NORM: 
    NUM_REPEATS: 6
    OUT_CHANNELS: 160
  CONDINST:
    BOTTOM_PIXELS_REMOVED: -1
    MASK_BRANCH:
      CHANNELS: 128
      IN_FEATURES: ['p3', 'p4', 'p5']
      NORM: BN
      NUM_CONVS: 4
      OUT_CHANNELS: 8
      SEMANTIC_LOSS_ON: False
    MASK_HEAD:
      CHANNELS: 8
      DISABLE_REL_COORDS: False
      NUM_LAYERS: 3
      USE_FP16: False
    MASK_OUT_STRIDE: 4
    MAX_PROPOSALS: -1
    TOPK_PROPOSALS_PER_IM: -1
  DEVICE: cuda
  DLA:
    CONV_BODY: DLA34
    NORM: SyncBN
    OUT_FEATURES: ['stage2', 'stage3', 'stage4', 'stage5']
  FCOS:
    BOX_QUALITY: ctrness
    CENTER_SAMPLE: True
    FPN_STRIDES: [8, 16, 32]
    INFERENCE_TH_TEST: 0.35
    INFERENCE_TH_TRAIN: 0.05
    IN_FEATURES: ['p3', 'p4', 'p5']
    LOC_LOSS_TYPE: giou
    LOSS_ALPHA: 0.25
    LOSS_GAMMA: 2.0
    LOSS_NORMALIZER_CLS: fg
    LOSS_WEIGHT_CLS: 1.0
    NMS_TH: 0.6
    NORM: GN
    NUM_BOX_CONVS: 4
    NUM_CLASSES: 1
    NUM_CLS_CONVS: 4
    NUM_SHARE_CONVS: 0
    POST_NMS_TOPK_TEST: 100
    POST_NMS_TOPK_TRAIN: 100
    POS_RADIUS: 1.5
    PRE_NMS_TOPK_TEST: 1000
    PRE_NMS_TOPK_TRAIN: 1000
    PRIOR_PROB: 0.01
    SIZES_OF_INTEREST: [64, 128]
    THRESH_WITH_CTR: True
    TOP_LEVELS: 0
    USE_DEFORMABLE: False
    USE_RELU: True
    USE_SCALE: False
    YIELD_BOX_FEATURES: False
    YIELD_PROPOSAL: False
  FCPOSE:
    ATTN_LEN: 2737
    BASIS_MODULE:
      BN_TYPE: SyncBN
      COMMON_STRIDE: 8
      CONVS_DIM: 128
      LOSS_WEIGHT: 0.2
      NUM_BASES: 32
      NUM_CLASSES: 17
    DISTANCE_NORM: 12.0
    DYNAMIC_CHANNELS: 32
    FOCAL_LOSS_ALPHA: 0.25
    FOCAL_LOSS_GAMMA: 2.0
    GT_HEATMAP_STRIDE: 2
    HEAD_HEATMAP_SIGMA: 0.01
    HEATMAP_SIGMA: 1.8
    LOSS_WEIGHT_DIRECTION: 9.0
    LOSS_WEIGHT_KEYPOINT: 2.5
    MAX_PROPOSALS: 70
    PROPOSALS_PER_INST: 70
    SIGMA: 1
  FCPOSE_ON: False
  FPN:
    FUSE_TYPE: sum
    IN_FEATURES: ['level3', 'level4', 'level5']
    NORM: 
    OUT_CHANNELS: 256
  KEYPOINT_ON: False
  LOAD_PROPOSALS: False
  MASK_ON: True
  MEInst:
    AGNOSTIC: True
    CENTER_SAMPLE: True
    DIM_MASK: 60
    FLAG_PARAMETERS: False
    FPN_STRIDES: [8, 16, 32, 64, 128]
    GCN_KERNEL_SIZE: 9
    INFERENCE_TH_TEST: 0.35
    INFERENCE_TH_TRAIN: 0.05
    IN_FEATURES: ['p3', 'p4', 'p5', 'p6', 'p7']
    IOU_LABELS: [0, 1]
    IOU_THRESHOLDS: [0.5]
    LAST_DEFORMABLE: False
    LOC_LOSS_TYPE: giou
    LOSS_ALPHA: 0.25
    LOSS_GAMMA: 2.0
    LOSS_ON_MASK: False
    MASK_LOSS_TYPE: mse
    MASK_ON: True
    MASK_SIZE: 28
    NMS_TH: 0.6
    NORM: GN
    NUM_BOX_CONVS: 4
    NUM_CLASSES: 1
    NUM_CLS_CONVS: 4
    NUM_MASK_CONVS: 4
    NUM_SHARE_CONVS: 0
    PATH_COMPONENTS: datasets/coco/components/coco_2017_train_class_agnosticTrue_whitenTrue_sigmoidTrue_60.npz
    POST_NMS_TOPK_TEST: 100
    POST_NMS_TOPK_TRAIN: 100
    POS_RADIUS: 1.5
    PRE_NMS_TOPK_TEST: 1000
    PRE_NMS_TOPK_TRAIN: 1000
    PRIOR_PROB: 0.01
    SIGMOID: True
    SIZES_OF_INTEREST: [64, 128, 256, 512]
    THRESH_WITH_CTR: False
    TOP_LEVELS: 2
    TYPE_DEFORMABLE: DCNv1
    USE_DEFORMABLE: False
    USE_GCN_IN_MASK: False
    USE_RELU: True
    USE_SCALE: True
    WHITEN: True
  META_ARCHITECTURE: BlendMask
  MOBILENET: False
  PANOPTIC_FPN:
    COMBINE:
      ENABLED: False
      INSTANCES_CONFIDENCE_THRESH: 0.35
      OVERLAP_THRESH: 0.5
      STUFF_AREA_LIMIT: 4096
    INSTANCE_LOSS_WEIGHT: 1.0
  PIXEL_MEAN: [103.53, 116.28, 123.675]
  PIXEL_STD: [1.0, 1.0, 1.0]
  PROPOSAL_GENERATOR:
    MIN_SIZE: 0
    NAME: FCOS
  RESNETS:
    DEFORM_INTERVAL: 1
    DEFORM_MODULATED: False
    DEFORM_NUM_GROUPS: 1
    DEFORM_ON_PER_STAGE: [False, False, False, False]
    DEPTH: 50
    NORM: FrozenBN
    NUM_GROUPS: 1
    OUT_FEATURES: ['res3', 'res4', 'res5']
    RES2_OUT_CHANNELS: 256
    RES4_DILATION: 1
    RES5_DILATION: 1
    RES5_MULTI_GRID: [1, 2, 4]
    STEM_OUT_CHANNELS: 64
    STEM_TYPE: deeplab
    STRIDE_IN_1X1: True
    WIDTH_PER_GROUP: 64
  RETINANET:
    BBOX_REG_LOSS_TYPE: smooth_l1
    BBOX_REG_WEIGHTS: (1.0, 1.0, 1.0, 1.0)
    FOCAL_LOSS_ALPHA: 0.25
    FOCAL_LOSS_GAMMA: 2.0
    IN_FEATURES: ['p3', 'p4', 'p5', 'p6', 'p7']
    IOU_LABELS: [0, -1, 1]
    IOU_THRESHOLDS: [0.4, 0.5]
    NMS_THRESH_TEST: 0.5
    NORM: 
    NUM_CLASSES: 1
    NUM_CONVS: 4
    PRIOR_PROB: 0.01
    SCORE_THRESH_TEST: 0.35
    SMOOTH_L1_LOSS_BETA: 0.1
    TOPK_CANDIDATES_TEST: 1000
  ROI_BOX_CASCADE_HEAD:
    BBOX_REG_WEIGHTS: ((10.0, 10.0, 5.0, 5.0), (20.0, 20.0, 10.0, 10.0), (30.0, 30.0, 15.0, 15.0))
    IOUS: (0.5, 0.6, 0.7)
  ROI_BOX_HEAD:
    BBOX_REG_LOSS_TYPE: smooth_l1
    BBOX_REG_LOSS_WEIGHT: 1.0
    BBOX_REG_WEIGHTS: (10.0, 10.0, 5.0, 5.0)
    CLS_AGNOSTIC_BBOX_REG: False
    CONV_DIM: 256
    FC_DIM: 1024
    NAME: 
    NORM: 
    NUM_CONV: 0
    NUM_FC: 0
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 0
    POOLER_TYPE: ROIAlignV2
    SMOOTH_L1_BETA: 0.0
    TRAIN_ON_PRED_BOXES: False
  ROI_HEADS:
    BATCH_SIZE_PER_IMAGE: 512
    IN_FEATURES: ['res4']
    IOU_LABELS: [0, 1]
    IOU_THRESHOLDS: [0.5]
    NAME: Res5ROIHeads
    NMS_THRESH_TEST: 0.5
    NUM_CLASSES: 1
    POSITIVE_FRACTION: 0.25
    PROPOSAL_APPEND_GT: True
    SCORE_THRESH_TEST: 0.35
  ROI_KEYPOINT_HEAD:
    CONV_DIMS: (512, 512, 512, 512, 512, 512, 512, 512)
    LOSS_WEIGHT: 1.0
    MIN_KEYPOINTS_PER_IMAGE: 1
    NAME: KRCNNConvDeconvUpsampleHead
    NORMALIZE_LOSS_BY_VISIBLE_KEYPOINTS: True
    NUM_KEYPOINTS: 17
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 0
    POOLER_TYPE: ROIAlignV2
  ROI_MASK_HEAD:
    CLS_AGNOSTIC_MASK: False
    CONV_DIM: 256
    NAME: MaskRCNNConvUpsampleHead
    NORM: 
    NUM_CONV: 0
    POOLER_RESOLUTION: 14
    POOLER_SAMPLING_RATIO: 0
    POOLER_TYPE: ROIAlignV2
  RPN:
    BATCH_SIZE_PER_IMAGE: 256
    BBOX_REG_LOSS_TYPE: smooth_l1
    BBOX_REG_LOSS_WEIGHT: 1.0
    BBOX_REG_WEIGHTS: (1.0, 1.0, 1.0, 1.0)
    BOUNDARY_THRESH: -1
    CONV_DIMS: [-1]
    HEAD_NAME: StandardRPNHead
    IN_FEATURES: ['res4']
    IOU_LABELS: [0, -1, 1]
    IOU_THRESHOLDS: [0.3, 0.7]
    LOSS_WEIGHT: 1.0
    NMS_THRESH: 0.7
    POSITIVE_FRACTION: 0.5
    POST_NMS_TOPK_TEST: 1000
    POST_NMS_TOPK_TRAIN: 2000
    PRE_NMS_TOPK_TEST: 6000
    PRE_NMS_TOPK_TRAIN: 12000
    SMOOTH_L1_BETA: 0.0
  SEM_SEG_HEAD:
    ASPP_CHANNELS: 256
    ASPP_DILATIONS: [6, 12, 18]
    ASPP_DROPOUT: 0.1
    COMMON_STRIDE: 4
    CONVS_DIM: 128
    IGNORE_VALUE: 255
    IN_FEATURES: ['p2', 'p3', 'p4', 'p5']
    LOSS_TYPE: hard_pixel_mining
    LOSS_WEIGHT: 1.0
    NAME: SemSegFPNHead
    NORM: GN
    NUM_CLASSES: 54
    PROJECT_CHANNELS: [48]
    PROJECT_FEATURES: ['res2']
    USE_DEPTHWISE_SEPARABLE_CONV: False
  SOLOV2:
    FPN_INSTANCE_STRIDES: [8, 8, 16, 32, 32]
    FPN_SCALE_RANGES: ((1, 96), (48, 192), (96, 384), (192, 768), (384, 2048))
    INSTANCE_CHANNELS: 512
    INSTANCE_IN_CHANNELS: 256
    INSTANCE_IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6']
    LOSS:
      DICE_WEIGHT: 3.0
      FOCAL_ALPHA: 0.25
      FOCAL_GAMMA: 2.0
      FOCAL_USE_SIGMOID: True
      FOCAL_WEIGHT: 1.0
    MASK_CHANNELS: 128
    MASK_IN_CHANNELS: 256
    MASK_IN_FEATURES: ['p2', 'p3', 'p4', 'p5']
    MASK_THR: 0.5
    MAX_PER_IMG: 100
    NMS_KERNEL: gaussian
    NMS_PRE: 500
    NMS_SIGMA: 2
    NMS_TYPE: matrix
    NORM: GN
    NUM_CLASSES: 1
    NUM_GRIDS: [40, 36, 24, 16, 12]
    NUM_INSTANCE_CONVS: 4
    NUM_KERNELS: 256
    NUM_MASKS: 256
    PRIOR_PROB: 0.01
    SCORE_THR: 0.1
    SIGMA: 0.2
    TYPE_DCN: DCN
    UPDATE_THR: 0.05
    USE_COORD_CONV: True
    USE_DCN_IN_INSTANCE: False
  TOP_MODULE:
    DIM: 16
    NAME: conv
  VOVNET:
    BACKBONE_OUT_CHANNELS: 256
    CONV_BODY: V-39-eSE
    NORM: FrozenBN
    OUT_CHANNELS: 256
    OUT_FEATURES: ['stage2', 'stage3', 'stage4', 'stage5']
  WEIGHTS: models_weights/DLA_34_syncbn_4x.pth
OUTPUT_DIR: output/blendmask/DLA_34_syncbn_4x
SEED: -1
SOLVER:
  AMP:
    ENABLED: False
  BASE_LR: 0.01
  BASE_LR_END: 0.0
  BIAS_LR_FACTOR: 1.0
  CHECKPOINT_PERIOD: 5000
  CLIP_GRADIENTS:
    CLIP_TYPE: value
    CLIP_VALUE: 1.0
    ENABLED: False
    NORM_TYPE: 2.0
  GAMMA: 0.1
  IMS_PER_BATCH: 16
  LR_SCHEDULER_NAME: WarmupMultiStepLR
  MAX_ITER: 360000
  MOMENTUM: 0.9
  NESTEROV: False
  POLY_LR_CONSTANT_ENDING: 0.0
  POLY_LR_POWER: 0.9
  REFERENCE_WORLD_SIZE: 0
  STEPS: (300000, 340000)
  WARMUP_FACTOR: 0.001
  WARMUP_ITERS: 1000
  WARMUP_METHOD: linear
  WEIGHT_DECAY: 0.0001
  WEIGHT_DECAY_BIAS: None
  WEIGHT_DECAY_NORM: 0.0
TEST:
  AUG:
    ENABLED: False
    FLIP: True
    MAX_SIZE: 4000
    MIN_SIZES: (400, 500, 600, 700, 800, 900, 1000, 1100, 1200)
  DETECTIONS_PER_IMAGE: 100
  EVAL_PERIOD: 0
  EXPECTED_RESULTS: []
  KEYPOINT_OKS_SIGMAS: []
  PRECISE_BN:
    ENABLED: False
    NUM_ITER: 200
VERSION: 2
VIS_PERIOD: 0