open-mmlab / mmdetection

OpenMMLab Detection Toolbox and Benchmark
https://mmdetection.readthedocs.io
Apache License 2.0
29.52k stars 9.45k forks source link

A Bug in Downstream task #10351

Open wangzhaoyang-508 opened 1 year ago

wangzhaoyang-508 commented 1 year ago

I pretrained a Swin-Large backbone in MMPretrain using the SimMIM method, but an error occurred when I used the resulting .pth checkpoint for the downstream object detection task (DINO).

The log is: bml@jupyter-1ae41f6b2e337d68-0:~/storage/wzy/projects/mmpretrain$ bash ./tools/benchmarks/mmdetection/mim_dist_train_fpn.sh ./configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py ./work_dirs/simmim_swin-large-w12_16xb128-amp-coslr-800e_in1k-192px/epoch_799.pth 4

Runtime environment: cudnn_benchmark: False mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} dist_cfg: {'backend': 'nccl'} seed: None Distributed launcher: pytorch Distributed training: True GPU number: 4

05/18 10:53:20 - mmengine - INFO - Config: dataset_type = 'CocoDataset' data_root = '/home/bml/storage/wzy/dataset/datasets/EL/cocox4/' backend_args = None train_pipeline = [ dict(type='LoadImageFromFile', backend_args=None), dict(type='LoadAnnotations', with_bbox=True), dict(type='RandomFlip', prob=0.5), dict( type='RandomChoice', transforms=[[{ 'type': 'RandomChoiceResize', 'scales': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'keep_ratio': True }], [{ 'type': 'RandomChoiceResize', 'scales': [(400, 4200), (500, 4200), (600, 4200)], 'keep_ratio': True }, { 'type': 'RandomCrop', 'crop_type': 'absolute_range', 'crop_size': (384, 600), 'allow_negative_crop': True }, { 'type': 'RandomChoiceResize', 'scales': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'keep_ratio': True }]]), dict(type='PackDetInputs') ] test_pipeline = [ dict(type='LoadImageFromFile', backend_args=None), dict(type='Resize', scale=(1333, 800), keep_ratio=True), dict(type='LoadAnnotations', with_bbox=True), dict( type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')) ] train_dataloader = dict( batch_size=1, num_workers=4, persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), batch_sampler=dict(type='AspectRatioBatchSampler'), dataset=dict( type='CocoDataset', data_root='/home/bml/storage/wzy/dataset/datasets/EL/cocox4/', ann_file='annotations/instances_train2017.json', data_prefix=dict(img='train2017/'), filter_cfg=dict(filter_empty_gt=False, min_size=32), pipeline=[ dict(type='LoadImageFromFile', backend_args=None), dict(type='LoadAnnotations', with_bbox=True), dict(type='RandomFlip', prob=0.5), dict( type='RandomChoice', transforms=[[{ 'type': 'RandomChoiceResize', 'scales': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 
1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'keep_ratio': True }], [{ 'type': 'RandomChoiceResize', 'scales': [(400, 4200), (500, 4200), (600, 4200)], 'keep_ratio': True }, { 'type': 'RandomCrop', 'crop_type': 'absolute_range', 'crop_size': (384, 600), 'allow_negative_crop': True }, { 'type': 'RandomChoiceResize', 'scales': [(480, 1333), (512, 1333), (544, 1333), (576, 1333), (608, 1333), (640, 1333), (672, 1333), (704, 1333), (736, 1333), (768, 1333), (800, 1333)], 'keep_ratio': True }]]), dict(type='PackDetInputs') ], backend_args=None)) val_dataloader = dict( batch_size=1, num_workers=2, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type='CocoDataset', data_root='/home/bml/storage/wzy/dataset/datasets/EL/cocox4/', ann_file='annotations/instances_val2017.json', data_prefix=dict(img='val2017/'), test_mode=True, pipeline=[ dict(type='LoadImageFromFile', backend_args=None), dict(type='Resize', scale=(1333, 800), keep_ratio=True), dict(type='LoadAnnotations', with_bbox=True), dict( type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')) ], backend_args=None)) test_dataloader = dict( batch_size=1, num_workers=2, persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( type='CocoDataset', data_root='/home/bml/storage/wzy/dataset/datasets/EL/cocox4/', ann_file='annotations/instances_val2017.json', data_prefix=dict(img='val2017/'), test_mode=True, pipeline=[ dict(type='LoadImageFromFile', backend_args=None), dict(type='Resize', scale=(1333, 800), keep_ratio=True), dict(type='LoadAnnotations', with_bbox=True), dict( type='PackDetInputs', meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'scale_factor')) ], backend_args=None)) val_evaluator = dict( type='CocoMetric', ann_file= '/home/bml/storage/wzy/dataset/datasets/EL/cocox4/annotations/instances_val2017.json', 
metric='bbox', format_only=False, backend_args=None) test_evaluator = dict( type='CocoMetric', ann_file= '/home/bml/storage/wzy/dataset/datasets/EL/cocox4/annotations/instances_val2017.json', metric='bbox', format_only=False, backend_args=None) default_scope = 'mmdet' default_hooks = dict( timer=dict(type='IterTimerHook'), logger=dict(type='LoggerHook', interval=100), param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict(type='CheckpointHook', interval=1), sampler_seed=dict(type='DistSamplerSeedHook'), visualization=dict(type='DetVisualizationHook')) env_cfg = dict( cudnn_benchmark=False, mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), dist_cfg=dict(backend='nccl')) vis_backends = [dict(type='LocalVisBackend')] visualizer = dict( type='DetLocalVisualizer', vis_backends=[dict(type='LocalVisBackend')], name='visualizer') log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True) log_level = 'INFO' load_from = None resume = False model = dict( type='DINO', num_queries=900, with_box_refine=True, as_two_stage=True, data_preprocessor=dict( type='DetDataPreprocessor', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], bgr_to_rgb=True, pad_size_divisor=1), backbone=dict( type='SwinTransformer', pretrain_img_size=384, embed_dims=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=True, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= './work_dirs/simmim_swin-large-w12_16xb128-amp-coslr-800e_in1k-192px/epoch_799.pth', prefix='backbone.')), neck=dict( type='ChannelMapper', in_channels=[192, 384, 768, 1536], kernel_size=1, out_channels=256, act_cfg=None, norm_cfg=dict(type='GN', num_groups=32), num_outs=5), encoder=dict( num_layers=6, layer_cfg=dict( self_attn_cfg=dict(embed_dims=256, num_levels=5, dropout=0.0), ffn_cfg=dict( embed_dims=256, 
feedforward_channels=2048, ffn_drop=0.0))), decoder=dict( num_layers=6, return_intermediate=True, layer_cfg=dict( self_attn_cfg=dict(embed_dims=256, num_heads=8, dropout=0.0), cross_attn_cfg=dict(embed_dims=256, num_levels=5, dropout=0.0), ffn_cfg=dict( embed_dims=256, feedforward_channels=2048, ffn_drop=0.0)), post_norm_cfg=None), positional_encoding=dict( num_feats=128, normalize=True, offset=0.0, temperature=20), bbox_head=dict( type='DINOHead', num_classes=8, sync_cls_avg_factor=True, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=5.0), loss_iou=dict(type='GIoULoss', loss_weight=2.0)), dn_cfg=dict( label_noise_scale=0.5, box_noise_scale=1.0, group_cfg=dict(dynamic=True, num_groups=None, num_dn_queries=100)), train_cfg=dict( assigner=dict( type='HungarianAssigner', match_costs=[ dict(type='FocalLossCost', weight=2.0), dict(type='BBoxL1Cost', weight=5.0, box_format='xywh'), dict(type='IoUCost', iou_mode='giou', weight=2.0) ])), test_cfg=dict(max_per_img=300), num_feature_levels=5) optim_wrapper = dict( type='OptimWrapper', optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0001), clip_grad=dict(max_norm=0.1, norm_type=2), paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.1)))) max_epochs = 36 train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=36, val_interval=1) val_cfg = dict(type='ValLoop') test_cfg = dict(type='TestLoop') param_scheduler = [ dict( type='MultiStepLR', begin=0, end=36, by_epoch=True, milestones=[27, 33], gamma=0.1) ] auto_scale_lr = dict(base_batch_size=4) fp16 = dict(loss_scale=512.0) pretrained = 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' num_levels = 5 launcher = 'pytorch' work_dir = './work_dirs/dino-5scale_swin-l_8xb2-36e_coco'

the error is 5/18 10:38:04 - mmengine - INFO - paramwise_options -- backbone.norm3.weight:lr_mult=0.1 05/18 10:38:04 - mmengine - INFO - paramwise_options -- backbone.norm3.bias:lr=1e-05 05/18 10:38:04 - mmengine - INFO - paramwise_options -- backbone.norm3.bias:weight_decay=0.0001 05/18 10:38:04 - mmengine - INFO - paramwise_options -- backbone.norm3.bias:lr_mult=0.1 loading annotations into memory... Done (t=0.27s) creating index... index created! loading annotations into memory... Done (t=0.26s) creating index... index created! 05/18 10:38:08 - mmengine - INFO - Loads checkpoint by local backend from path: ./work_dirs/simmim_swin-large-w12_16xb128-amp-coslr-800e_in1k-192px/epoch_799.pth Traceback (most recent call last): File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 133, in Traceback (most recent call last): File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 133, in Traceback (most recent call last): File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 133, in Traceback (most recent call last): File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 133, in main() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 129, in main main() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 129, in main main() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 129, in main main() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py", line 129, in main runner.train() File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 1692, in train self._init_model_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 899, in _init_model_weights model.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/detectors/dino.py", line 71, in init_weights runner.train() File 
"/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 1692, in train runner.train() File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 1692, in train runner.train() File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 1692, in train self._init_model_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 899, in _init_model_weights self._init_model_weights()model.init_weights()

File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 899, in _init_model_weights File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/detectors/dino.py", line 71, in init_weights self._init_model_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/runner/runner.py", line 899, in _init_model_weights model.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/detectors/dino.py", line 71, in init_weights model.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/detectors/dino.py", line 71, in init_weights super(DeformableDETR, self).init_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/model/base_module.py", line 130, in init_weights super(DeformableDETR, self).init_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/model/base_module.py", line 130, in init_weights m.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/backbones/swin.py", line 728, in init_weights m.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/backbones/swin.py", line 728, in init_weights super(DeformableDETR, self).init_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/model/base_module.py", line 130, in init_weights m.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/backbones/swin.py", line 728, in init_weights super(DeformableDETR, self).init_weights() File "/opt/conda/lib/python3.7/site-packages/mmengine/model/base_module.py", line 130, in init_weights m.init_weights() File "/home/bml/storage/wzy/projects/mmdetection/mmdet/models/backbones/swin.py", line 728, in init_weights table_current = self.state_dict()[table_key] KeyError: 'backbone.stages.0.blocks.0.attn.w_msa.relative_position_bias_table' table_current = self.state_dict()[table_key] KeyError: 'backbone.stages.0.blocks.0.attn.w_msa.relative_position_bias_table' table_current = self.state_dict()[table_key] KeyError: 
'backbone.stages.0.blocks.0.attn.w_msa.relative_position_bias_table' table_current = self.state_dict()[table_key] KeyError: 'backbone.stages.0.blocks.0.attn.w_msa.relative_position_bias_table' ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 39229) of binary: /opt/conda/bin/python Traceback (most recent call last): File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main "main", mod_spec) File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code exec(code, run_globals) File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in main() File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main launch(args) File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch run(args) File "/opt/conda/lib/python3.7/site-packages/torch/distributed/run.py", line 713, in run )(*cmd_args) File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in call return launch_agent(self._config, self._entrypoint, list(args)) File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent failures=result.failures, torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py FAILED

Failures: [1]: time : 2023-05-18_10:38:19 host : jupyter-1ae41f6b2e337d68-0 rank : 1 (local_rank: 1) exitcode : 1 (pid: 39230) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [2]: time : 2023-05-18_10:38:19 host : jupyter-1ae41f6b2e337d68-0 rank : 2 (local_rank: 2) exitcode : 1 (pid: 39231) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html [3]: time : 2023-05-18_10:38:19 host : jupyter-1ae41f6b2e337d68-0 rank : 3 (local_rank: 3) exitcode : 1 (pid: 39232) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Root Cause (first observed failure): [0]: time : 2023-05-18_10:38:19 host : jupyter-1ae41f6b2e337d68-0 rank : 0 (local_rank: 0) exitcode : 1 (pid: 39229) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html

Traceback (most recent call last): File "/home/bml/storage/.local/bin/mim", line 8, in sys.exit(cli()) File "/home/bml/storage/.local/lib/python3.7/site-packages/click/core.py", line 1130, in call return self.main(args, kwargs) File "/home/bml/storage/.local/lib/python3.7/site-packages/click/core.py", line 1055, in main rv = self.invoke(ctx) File "/home/bml/storage/.local/lib/python3.7/site-packages/click/core.py", line 1657, in invoke return _process_result(sub_ctx.command.invoke(sub_ctx)) File "/home/bml/storage/.local/lib/python3.7/site-packages/click/core.py", line 1404, in invoke return ctx.invoke(self.callback, ctx.params) File "/home/bml/storage/.local/lib/python3.7/site-packages/click/core.py", line 760, in invoke return __callback(args, **kwargs) File "/home/bml/storage/.local/lib/python3.7/site-packages/mim/commands/train.py", line 111, in cli other_args=other_args) File "/home/bml/storage/.local/lib/python3.7/site-packages/mim/commands/train.py", line 262, in train cmd, env=dict(os.environ, MASTER_PORT=str(port))) File "/opt/conda/lib/python3.7/subprocess.py", line 363, in check_call raise CalledProcessError(retcode, cmd) subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-m', 'torch.distributed.launch', '--nproc_per_node=4', '--master_port=28365', '/home/bml/storage/wzy/projects/mmdetection/mmdet/.mim/tools/train.py', '/home/bml/storage/wzy/projects/mmdetection/configs/dino/dino-5scale_swin-l_8xb2-36e_coco.py', '--launcher', 'pytorch', '--cfg-options', 'model.backbone.init_cfg.type=Pretrained', 'model.backbone.init_cfg.checkpoint=./work_dirs/simmim_swin-large-w12_16xb128-amp-coslr-800e_in1k-192px/epoch_799.pth', 'model.backbone.init_cfg.prefix=backbone.']' returned non-zero exit status 1. bml@jupyter-1ae41f6b2e337d68-0:~/storage/wzy/projects/mmpretrain$

I think the key error is "KeyError: 'backbone.stages.0.blocks.0.attn.w_msa.relative_position_bias_table'", raised by the statement "table_current = self.state_dict()[table_key]", but I cannot fix it. Could you please help me?

sofiapfund commented 8 months ago

I get the same error. Did you manage to solve it?