That's weird. Could you upload your whole config file?
OK, my config file is as follows:
```python
model = dict(
    type='SpatialFlow',
    pretrained='torchvision://resnet50',
    backbone=dict(
        type='ResNet',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        frozen_stages=1,
        norm_cfg=dict(type='BN', requires_grad=True),
        norm_eval=True,
        style='pytorch'),
    neck=dict(
        type='FPN',
        in_channels=[256, 512, 1024, 2048],
        out_channels=256,
        start_level=1,
        add_extra_convs='on_input',
        num_outs=5),
    bbox_head=dict(
        type='SpatialFlowHead',
        num_classes=8,
        in_channels=256,
        stacked_convs=4,
        stacked_mask_convs=1,
        stacked_stuff_convs=4,
        dcn_cls_convs_idx=None,
        dcn_reg_convs_idx=None,
        dcn_mask_convs_idx=None,
        dcn_stuff_convs_idx=None,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            octave_base_scale=4,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[.0, .0, .0, .0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
    mask_roi_extractor=dict(
        type='SingleRoIExtractor',
        roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
        out_channels=256,
        featmap_strides=[8, 16, 32]),
    mask_head=dict(
        type='FCNMaskHead',
        num_convs=4,
        in_channels=256,
        conv_out_channels=256,
        num_classes=8,
        loss_mask=dict(
            type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
    stuff_head=dict(
        type='StuffHead',
        stuff_num_classes=12,
        in_channels=256,
        feat_channels=128,
        feat_indexes=[0, 1, 2],
        feat_strides=[8, 16, 32],
        out_stride=4,
        conv_cfg=dict(type='DCNv2'),
        norm_cfg=dict(type='GN', num_groups=32),
        loss_stuff=dict(
            type='CrossEntropyLoss', ignore_index=255, loss_weight=0.25)))
train_cfg = dict(
    single_stage=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.5,
            neg_iou_thr=0.4,
            min_pos_iou=0,
            ignore_iof_thr=-1),
        allowed_border=-1,
        pos_weight=-1,
        debug=False),
    single_stage_nms=dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.05,
        nms=dict(type='nms', iou_threshold=0.5),
        max_per_img=100),
    single_stage_mask=dict(
        assigner=dict(
            type='MaxIoUAssigner',
            pos_iou_thr=0.5,
            neg_iou_thr=0.5,
            min_pos_iou=0.5,
            ignore_iof_thr=-1),
        sampler=dict(type='PseudoSampler', add_gt_as_proposals=True),
        mask_size=28,
        pos_weight=-1,
        debug=False))
test_cfg = dict(
    single_stage=dict(
        nms_pre=1000,
        min_bbox_size=0,
        score_thr=0.05,
        nms=dict(type='nms', iou_threshold=0.6),
        max_per_img=100),
    single_stage_mask=dict(mask_thr_binary=0.5))
confidence_thr = 0.37
overlap_thr = 0.37
stuff_area_limit = 2048
using_bbox = True
bbox_overlap_thr = 0.5
dataset_type = 'CityscapesDataset'
data_root = 'data/cityscapes/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
    dict(type='Resize', img_scale=[(2048, 512), (2048, 1024)], keep_ratio=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='Pad', size_divisor=32),
    dict(type='SegRescale', scale_factor=1.0),
    dict(type='DefaultFormatBundle'),
    dict(
        type='Collect',
        keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 1024),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='CityscapesDataset',
        ann_file='data/cityscapes/annotations/panoptic_train2017_detection_format_things_only.json',
        img_prefix='data/cityscapes/leftImg8bit/train/',
        with_panoptic=True,
        things_other=True,
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='LoadAnnotations',
                with_bbox=True,
                with_mask=True,
                with_seg=True),
            dict(
                type='Resize',
                img_scale=[(2048, 512), (2048, 1024)],
                keep_ratio=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='SegRescale', scale_factor=1.0),
            dict(type='DefaultFormatBundle'),
            dict(
                type='Collect',
                keys=[
                    'img', 'gt_bboxes', 'gt_labels', 'gt_masks',
                    'gt_semantic_seg'
                ])
        ],
        seg_prefix='data/cityscapes/annotations/panoptic_train2017_semantic_segmentation_things_other_pngs/'),
    val=dict(
        type='CityscapesDataset',
        ann_file='data/cityscapes/annotations/panoptic_val2017_detection_format_things_only.json',
        img_prefix='data/cityscapes/leftImg8bit/val/',
        with_panoptic=True,
        things_other=True,
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2048, 1024),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='CityscapesDataset',
        ann_file='data/cityscapes/annotations/panoptic_val2017_detection_format_things_only.json',
        img_prefix='data/cityscapes/leftImg8bit/val/',
        with_panoptic=True,
        things_other=True,
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2048, 1024),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=32),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
evaluation = dict(metric=['panoptic'])
images_json_file = 'data/cityscapes/annotations/panoptic_val2017_detection_format_things_only.json'
categories_json_file = 'data/cityscapes/annotations/panoptic_cityscape_categories.json'
gt_json_file = 'data/cityscapes/annotations/panoptic_val2017.json'
gt_folder = 'data/cityscapes/annotations/panoptic_val2017/'
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[48, 60])
total_epochs = 64
checkpoint_config = dict(interval=1)
log_config = dict(interval=250, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
segmentations_folder = './work_dirs/spatialflow_r50_fpn_20e_cityscape/segmentations_folder_val_pred/'
panoptic_json_file = './work_dirs/spatialflow_r50_fpn_20e_cityscape/panoptic_val_pred.json'
work_dir = './work_dirs/spatialflow_r50_fpn_20e_cityscape'
gpu_ids = range(0, 1)
```
This may be caused by an OOM when evaluating the checkpoint during training. Try adding --no-validate for training, and test the checkpoint manually after the training finishes.
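Roughly like this (a minimal sketch assuming the standard mmdetection-style tools/ scripts shipped with this repo and a single GPU, as in your gpu_ids; the config filename is only a guess based on your work_dir, and the metric name follows the evaluation setting in your config):

```bash
# Train without the in-training evaluation hook (the step suspected of running OOM)
python tools/train.py configs/spatialflow/spatialflow_r50_fpn_20e_cityscape.py --no-validate

# After training finishes, evaluate a saved checkpoint manually
python tools/test.py configs/spatialflow/spatialflow_r50_fpn_20e_cityscape.py \
    work_dirs/spatialflow_r50_fpn_20e_cityscape/epoch_64.pth \
    --eval panoptic
```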
OK, thank you very much. I will add this and try it again.
To check, you can also resume from epoch_24.pth
for further training, and this phenomenon will show up again at epoch 48.
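For example (again only a sketch; --resume-from is the standard mmdetection training flag, the config filename is a guess based on your work_dir, and the checkpoint path assumes the work_dir from the config above):

```bash
# Resume training from the epoch-24 checkpoint saved in the work_dir
python tools/train.py configs/spatialflow/spatialflow_r50_fpn_20e_cityscape.py \
    --resume-from work_dirs/spatialflow_r50_fpn_20e_cityscape/epoch_24.pth
```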
OK, I will also try this. Thanks for your advice.
Hi, I want to train on Cityscapes for longer, so I set total_epochs = 64 as follows:

```python
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001,
    step=[48, 60])
total_epochs = 64
```
but it stopped at epoch 24:

```
2020-11-11 00:07:42,660 - mmdet - INFO - Epoch [24][250/371]  lr: 1.000e-02, eta: 1:31:00, time: 1.828, data_time: 0.440,
2020-11-11 06:21:29,255 - mmdet - INFO - Saving checkpoint at 24 epochs
2020-11-11 06:24:03,237 - mmdet - INFO - Evaluating panoptic...
```
I tried again and got the same result. Could you tell me why this happens? Looking forward to your reply.