Closed: lingchensimon closed this issue 2 years ago
I have the same problem
- Transformer-based networks need much more data; the amount of data in the VOC dataset is probably too small. You can try weights pre-trained on ImageNet-1k. If you want to learn how to use pre-trained weights, you can refer to the Colab notebook, and see the sketch after this list.
- The configs mmcls provides are for the ImageNet-1k dataset. If you want to use them on another dataset, you need to find better data preprocessing and hyperparameters.
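For example, a rough sketch of how your model section could load ImageNet-1k pre-trained backbone weights through `init_cfg` (the checkpoint path below is only a placeholder, replace it with a real ImageNet-1k Swin-Tiny checkpoint file or URL; everything else in your config can stay as it is):

# Minimal sketch: only the backbone gains an init_cfg entry.
# The checkpoint path is a placeholder, not a real file.
model = dict(
    type='ImageClassifier',
    backbone=dict(
        type='SwinTransformer',
        arch='tiny',
        img_size=224,
        drop_path_rate=0.2,
        init_cfg=dict(
            type='Pretrained',
            checkpoint='path/to/swin_tiny_imagenet1k.pth',  # placeholder path
            prefix='backbone')),  # load backbone weights only; the head stays randomly initialized
    neck=dict(type='GlobalAveragePooling', dim=1),
    head=dict(
        type='MultiLabelLinearClsHead',
        num_classes=8,
        in_channels=768,
        loss=dict(
            type='LabelSmoothLoss', label_smooth_val=0.1, mode='original')),
    init_cfg=[
        dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0),
        dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0)
    ])

When fine-tuning from pre-trained weights, it is usually also worth trying a smaller learning rate and shorter warmup than the from-scratch settings.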
I will adjust and continue training. Thank you very much for your advice!
Checklist
- I have searched related issues but cannot get the expected help.
- I have read related documents and don't know what to do.

Describe the question you meet
Hi mmcls team, I did a series of experiments to select a model. On my existing dataset, the Swin Transformer's accuracy is much lower than ResNet-50's. Maybe there is something wrong with my configuration file; do I need to change something to use this model? The configuration files and accuracy are pasted below. I would appreciate it if someone could provide any suggestions or solutions.
Thank you,
swin_tiny_224_b16x64_VOC.py:
model = dict( type='ImageClassifier', backbone=dict( type='SwinTransformer', arch='tiny', img_size=224, drop_path_rate=0.2), neck=dict(type='GlobalAveragePooling', dim=1), head=dict( type='MultiLabelLinearClsHead', num_classes=8, in_channels=768, loss=dict( type='LabelSmoothLoss', label_smooth_val=0.1, mode='original')), init_cfg=[ dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.0), dict(type='Constant', layer='LayerNorm', val=1.0, bias=0.0) ]) dataset_type = 'VOC' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) policies = [ dict(type='AutoContrast'), dict(type='Equalize'), dict(type='Invert'), dict( type='Rotate', interpolation='bicubic', magnitude_key='angle', pad_val=(104, 116, 124), magnitude_range=(0, 30)), dict(type='Posterize', magnitude_key='bits', magnitude_range=(4, 0)), dict(type='Solarize', magnitude_key='thr', magnitude_range=(256, 0)), dict( type='SolarizeAdd', magnitude_key='magnitude', magnitude_range=(0, 110)), dict( type='ColorTransform', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict(type='Contrast', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Brightness', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Sharpness', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Shear', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.3), pad_val=(104, 116, 124), direction='horizontal'), dict( type='Shear', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.3), pad_val=(104, 116, 124), direction='vertical'), dict( type='Translate', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.45), pad_val=(104, 116, 124), direction='horizontal'), dict( type='Translate', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.45), pad_val=(104, 116, 124), direction='vertical') ] train_pipeline = [ dict(type='LoadImageFromFile'), dict( type='RandomResizedCrop', size=224, backend='pillow', interpolation='bicubic'), dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), dict( type='RandAugment', policies=[ dict(type='AutoContrast'), dict(type='Equalize'), dict(type='Invert'), dict( type='Rotate', interpolation='bicubic', magnitude_key='angle', pad_val=(104, 116, 124), magnitude_range=(0, 30)), dict( type='Posterize', magnitude_key='bits', magnitude_range=(4, 0)), dict( type='Solarize', magnitude_key='thr', magnitude_range=(256, 0)), dict( type='SolarizeAdd', magnitude_key='magnitude', magnitude_range=(0, 110)), dict( type='ColorTransform', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Contrast', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Brightness', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Sharpness', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Shear', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.3), pad_val=(104, 116, 124), direction='horizontal'), dict( type='Shear', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.3), pad_val=(104, 116, 124), direction='vertical'), dict( type='Translate', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.45), pad_val=(104, 116, 124), direction='horizontal'), dict( type='Translate', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.45), pad_val=(104, 116, 124), direction='vertical') ], num_policies=2, total_level=10, magnitude_level=9, magnitude_std=0.5), 
dict( type='RandomErasing', erase_prob=0.25, mode='rand', min_area_ratio=0.02, max_area_ratio=0.3333333333333333, fill_color=[103.53, 116.28, 123.675], fill_std=[57.375, 57.12, 58.395]), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='ToTensor', keys=['gt_label']), dict(type='Collect', keys=['img', 'gt_label']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', size=224, backend='pillow', interpolation='bicubic'), dict(type='CenterCrop', crop_size=224), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ] data = dict( samples_per_gpu=32, workers_per_gpu=2, train=dict( type='VOC', data_prefix='data/VOCdevkit/VOC2007/', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/train.txt', pipeline=[ dict(type='LoadImageFromFile'), dict( type='RandomResizedCrop', size=224, backend='pillow', interpolation='bicubic'), dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), dict( type='RandAugment', policies=[ dict(type='AutoContrast'), dict(type='Equalize'), dict(type='Invert'), dict( type='Rotate', interpolation='bicubic', magnitude_key='angle', pad_val=(104, 116, 124), magnitude_range=(0, 30)), dict( type='Posterize', magnitude_key='bits', magnitude_range=(4, 0)), dict( type='Solarize', magnitude_key='thr', magnitude_range=(256, 0)), dict( type='SolarizeAdd', magnitude_key='magnitude', magnitude_range=(0, 110)), dict( type='ColorTransform', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Contrast', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Brightness', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Sharpness', magnitude_key='magnitude', magnitude_range=(0, 0.9)), dict( type='Shear', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.3), pad_val=(104, 116, 124), direction='horizontal'), dict( type='Shear', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.3), pad_val=(104, 116, 124), direction='vertical'), dict( type='Translate', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.45), pad_val=(104, 116, 124), direction='horizontal'), dict( type='Translate', interpolation='bicubic', magnitude_key='magnitude', magnitude_range=(0, 0.45), pad_val=(104, 116, 124), direction='vertical') ], num_policies=2, total_level=10, magnitude_level=9, magnitude_std=0.5), dict( type='RandomErasing', erase_prob=0.25, mode='rand', min_area_ratio=0.02, max_area_ratio=0.3333333333333333, fill_color=[103.53, 116.28, 123.675], fill_std=[57.375, 57.12, 58.395]), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='ToTensor', keys=['gt_label']), dict(type='Collect', keys=['img', 'gt_label']) ]), val=dict( type='VOC', data_prefix='data/VOCdevkit/VOC2007/', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt', pipeline=[ dict(type='LoadImageFromFile'), dict( type='Resize', size=224, backend='pillow', interpolation='bicubic'), dict(type='CenterCrop', crop_size=224), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]), test=dict( type='VOC', data_prefix='data/VOCdevkit/VOC2007/', 
ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/test.txt', pipeline=[ dict(type='LoadImageFromFile'), dict( type='Resize', size=224, backend='pillow', interpolation='bicubic'), dict(type='CenterCrop', crop_size=224), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ])) evaluation = dict( interval=1, metric=['mAP', 'CP', 'OP', 'CR', 'OR', 'CF1', 'OF1']) paramwise_cfg = dict( norm_decay_mult=0.0, bias_decay_mult=0.0, custom_keys=dict({ '.absolute_pos_embed': dict(decay_mult=0.0), '.relative_position_bias_table': dict(decay_mult=0.0) })) optimizer = dict( type='AdamW', lr=0.0007, weight_decay=0.0001, eps=1e-08, betas=(0.9, 0.999), paramwise_cfg=dict( norm_decay_mult=0.0, bias_decay_mult=0.0, custom_keys=dict({ '.absolute_pos_embed': dict(decay_mult=0.0), '.relative_position_bias_table': dict(decay_mult=0.0) }))) optimizer_config = dict(grad_clip=dict(max_norm=5.0)) lr_config = dict( policy='CosineAnnealing', by_epoch=False, min_lr_ratio=0.01, warmup='linear', warmup_ratio=0.001, warmup_iters=25040, warmup_by_epoch=False) runner = dict(type='EpochBasedRunner', max_epochs=50) checkpoint_config = dict(interval=10) log_config = dict( interval=100, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] work_dir = './work_dirs/swin_tiny_224_b16x64_VOC' gpu_ids = range(0, 1)
resnet50_b32x1_voc.py:
dataset_type = 'VOC' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', size=224), dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='ToTensor', keys=['gt_label']), dict(type='Collect', keys=['img', 'gt_label']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', size=224), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ] data = dict( samples_per_gpu=32, workers_per_gpu=1, train=dict( type='VOC', data_prefix='data/VOCdevkit/VOC2007/', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/train.txt', pipeline=[ dict(type='LoadImageFromFile'), dict(type='Resize', size=224), dict(type='RandomFlip', flip_prob=0.5, direction='horizontal'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='ToTensor', keys=['gt_label']), dict(type='Collect', keys=['img', 'gt_label']) ]), val=dict( type='VOC', data_prefix='data/VOCdevkit/VOC2007/', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/trainval.txt', pipeline=[ dict(type='LoadImageFromFile'), dict(type='Resize', size=224), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]), test=dict( type='VOC', data_prefix='data/VOCdevkit/VOC2007/', ann_file='data/VOCdevkit/VOC2007/ImageSets/Main/test.txt', pipeline=[ dict(type='LoadImageFromFile'), dict(type='Resize', size=224), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ])) evaluation = dict( interval=1, metric=['mAP', 'CP', 'OP', 'CR', 'OR', 'CF1', 'OF1']) checkpoint_config = dict(interval=10) log_config = dict( interval=100, hooks=[dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook')]) dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) lr_config = dict( policy='CosineAnnealing', min_lr=0, warmup='linear', warmup_iters=5, warmup_ratio=1e-06, warmup_by_epoch=True) runner = dict(type='EpochBasedRunner', max_epochs=100) model = dict( type='ImageClassifier', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(3, ), style='pytorch'), neck=dict(type='GlobalAveragePooling'), head=dict( type='MultiLabelLinearClsHead', num_classes=8, in_channels=2048, loss=dict(type='CrossEntropyLoss', loss_weight=1.0, use_soft=True))) work_dir = './work_dirs/resnet50_b32x1_voc' gpu_ids = range(0, 1)