I got a different result on segmentation. My CUDA version is 11.8. Installing mmcv-full 1.5.0 and mmsegmentation 0.27.0 according to conda-extra.yaml failed; instead, PyTorch cu11.8, mmcv 2.0.1, mmsegmentation 1.2.0, and mmengine 0.9.0 were installed successfully.
Some incompatibility bugs caused by the different versions of mmcv and mmsegmentation were fixed, including in the config file. But I still get a wrong segmentation result. Does anyone know what the problem is?
I got a different result on segmentation. My CUDA version is 11.8. Installing mmcv-full 1.5.0 and mmsegmentation 0.27.0 according to conda-extra.yaml failed; instead, PyTorch cu11.8, mmcv 2.0.1, mmsegmentation 1.2.0, and mmengine 0.9.0 were installed successfully. Some incompatibility bugs caused by the different versions of mmcv and mmsegmentation were fixed, including in the config file. But I still get a wrong segmentation result. Does anyone know what the problem is?
# ==================== dinov2_vits14_voc2012_ms_config.py ====================
# FIX(review): the original banner line above was bare '===...' text, which is
# not valid Python; it is kept as a comment so the config file can be parsed.

# Dataset settings (mmseg 0.x / mmcv 1.x style config).
dataset_type = 'PascalVOCDataset'
data_root = '/checkpoint/dino/datasets/VOC2012'

# ImageNet mean/std; to_rgb=True converts the BGR image loaded by mmcv to RGB
# before normalization.
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
crop_size = (640, 640)

# Training pipeline. The huge first edge of img_scale effectively means
# "resize the short side to 640, leave the long side unconstrained"; the
# ratio_range then jitters that scale by 1.0-3.0x before random cropping.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='Resize', img_scale=(99999999, 640), ratio_range=(1.0, 3.0)),
    dict(type='RandomCrop', crop_size=(640, 640), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    # 255 marks padded label pixels as "ignore" for the loss.
    dict(type='Pad', size=(640, 640), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
# FIX(review): the original test_pipeline was truncated mid-definition
# ("dict(" with no body and a dangling "]"), a syntax error. Reconstructed to
# match the val/test pipelines used in the `data` dict below: multi-scale
# (5 ratios of the short-side-640 scale) flip test-time augmentation.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(99999999, 640),
        img_ratios=[1.0, 1.32, 1.73, 2.28, 3.0],
        flip=True,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
# Data loaders: 2 images per GPU, 6 workers per GPU.
#
# BUG FIX(review): the test pipeline had been hand-patched into a mix of
# mmseg 0.x transforms ('Collect') and mmseg 1.x transforms ('PackSegInputs')
# in the same list, plus a fixed Resize scale=(640, 480) that overrides the
# multi-scale ratios and RandomFlip prob=0.0 that disables the advertised flip
# TTA. That combination is valid in neither mmseg version and is the likely
# cause of the wrong segmentation result. The test pipeline is restored here
# to be identical to the val pipeline, consistent with the rest of this
# 0.x-style config (log_config / runner / DistOptimizerHook are all mmcv 1.x).
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=6,
    train=dict(
        type='PascalVOCDataset',
        data_root='/checkpoint/dino/datasets/VOC2012',
        img_dir='JPEGImages',
        # Train on the union of the original and augmented (SBD) label sets.
        ann_dir=['SegmentationClass', 'SegmentationClassAug'],
        split=[
            'ImageSets/Segmentation/train.txt',
            'ImageSets/Segmentation/aug.txt'
        ],
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations'),
            dict(
                type='Resize',
                img_scale=(99999999, 640),
                ratio_range=(1.0, 3.0)),
            dict(type='RandomCrop', crop_size=(640, 640), cat_max_ratio=0.75),
            dict(type='RandomFlip', prob=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size=(640, 640), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg'])
        ]),
    val=dict(
        type='PascalVOCDataset',
        data_root='/checkpoint/dino/datasets/VOC2012',
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(99999999, 640),
                img_ratios=[1.0, 1.32, 1.73, 2.28, 3.0],
                flip=True,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='PascalVOCDataset',
        data_root='/checkpoint/dino/datasets/VOC2012',
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        # Identical to the val pipeline (see BUG FIX note above).
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(99999999, 640),
                img_ratios=[1.0, 1.32, 1.73, 2.28, 3.0],
                flip=True,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))

# Runtime settings (mmcv 1.x hooks / runner style).
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True

# Optimizer and schedule: AdamW with linear warmup into a polynomial decay,
# iteration-based training for 40k iterations.
optimizer = dict(
    type='AdamW', lr=0.001, weight_decay=0.0001, betas=(0.9, 0.999))
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=1,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=False)
lr_config = dict(
    policy='poly',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-06,
    power=1.0,
    min_lr=0.0,
    by_epoch=False)
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=10000)
evaluation = dict(interval=40000, metric='mIoU', pre_eval=True)
fp16 = None
find_unused_parameters = True

# Model: frozen-style DINOv2 ViT-S/14 backbone with a linear (BN) head over
# the concatenation of the last four transformer blocks' outputs.
norm_cfg = dict(type='SyncBN', requires_grad=True)
# NOTE(review): data_preprocessor / SegDataPreProcessor is an mmseg 1.x
# concept; mmseg 0.x configs normalize via the 'Normalize' pipeline transform
# instead. Kept as-is since it was present in the source — confirm it matches
# the mmseg version actually installed.
data_preprocessor = dict(type='SegDataPreProcessor')
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained=None,
    backbone=dict(type='DinoVisionTransformer', out_indices=[8, 9, 10, 11]),
    decode_head=dict(
        type='BNHead',
        in_channels=[384, 384, 384, 384],
        in_index=[0, 1, 2, 3],
        input_transform='resize_concat',
        channels=1536,
        dropout_ratio=0,
        num_classes=21,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # NOTE(review): sliding-window inference crop (796) differs from the
    # training crop (640) — presumably intentional for the ViT-S/14 patch
    # grid, but worth confirming against the original DINOv2 eval config.
    test_cfg=dict(mode='slide', crop_size=(796, 796), stride=(160, 160)))
auto_resume = True
gpu_ids = range(0, 8)
work_dir = '/checkpoint/dino/evaluations/segmentation/dinov2_vits14_voc2012_ms'