facebookresearch / dinov2

PyTorch code and models for the DINOv2 self-supervised learning method.
Apache License 2.0
9.05k stars 798 forks source link

An issue about the semantic segmentation notebook? #277

Open MichaleWong opened 1 year ago

MichaleWong commented 1 year ago

image I got a different result on segmentation. My CUDA version is 11.8. It failed to install mmcv-full 1.5.0 and mmsegmentation 0.27.0 according to conda-extra.yaml; instead, PyTorch for cu11.8, mmcv 2.0.1, mmsegmentation 1.2.0, and mmengine 0.9.0 were installed successfully. Some incompatibility bugs caused by the different versions of mmcv and mmsegmentation were fixed, including in the config file. But I still get the wrong segmentation result. Does anyone know what the problem is?

====================dinov2_vits14_voc2012_ms_config.py==================================== dataset_type = 'PascalVOCDataset' data_root = '/checkpoint/dino/datasets/VOC2012' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (640, 640) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='Resize', img_scale=(99999999, 640), ratio_range=(1.0, 3.0)), dict(type='RandomCrop', crop_size=(640, 640), cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size=(640, 640), pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']) ]

# Test-time pipeline (mmsegmentation 1.x style).
#
# Bug fixed: the original left a second `dict(` unclosed — all of its
# arguments were commented out, but the closing `#])` was commented out
# too, so the list ended with a mismatched `]` and the config did not
# parse.  In mmseg 1.x, Normalize/ImageToTensor are handled by the
# SegDataPreProcessor, so only loading and packing remain here.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    # NOTE(review): upstream mmseg 1.x eval configs usually also insert
    # dict(type='Resize', scale=..., keep_ratio=True) and
    # dict(type='LoadAnnotations') before packing — confirm against the
    # installed mmsegmentation's PascalVOC config.
    dict(type='PackSegInputs')
]

# Dataloader configuration (mmsegmentation 0.x layout: samples_per_gpu /
# workers_per_gpu with per-split dataset dicts).
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=6,
    train=dict(
        type='PascalVOCDataset',
        data_root='/checkpoint/dino/datasets/VOC2012',
        img_dir='JPEGImages',
        # Train on the union of the original and augmented annotation sets.
        ann_dir=['SegmentationClass', 'SegmentationClassAug'],
        split=[
            'ImageSets/Segmentation/train.txt',
            'ImageSets/Segmentation/aug.txt'
        ],
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations'),
            dict(
                type='Resize',
                img_scale=(99999999, 640),
                ratio_range=(1.0, 3.0)),
            dict(type='RandomCrop', crop_size=(640, 640), cat_max_ratio=0.75),
            dict(type='RandomFlip', prob=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size=(640, 640), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg'])
        ]),
    val=dict(
        type='PascalVOCDataset',
        data_root='/checkpoint/dino/datasets/VOC2012',
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(99999999, 640),
                img_ratios=[1.0, 1.32, 1.73, 2.28, 3.0],
                flip=True,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='PascalVOCDataset',
        data_root='/checkpoint/dino/datasets/VOC2012',
        img_dir='JPEGImages',
        ann_dir='SegmentationClass',
        split='ImageSets/Segmentation/val.txt',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(99999999, 640),
                img_ratios=[1.0, 1.32, 1.73, 2.28, 3.0],
                flip=True,
                transforms=[
                    dict(type='Resize', scale=(640, 480), keep_ratio=True),
                    dict(type='RandomFlip', prob=0.0),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='ImageToTensor', keys=['img']),
                    # NOTE(review): this mixes a 0.x formatting step
                    # (Collect) with a 1.x one (PackSegInputs) — likely
                    # only one is valid for the installed mmseg version.
                    dict(type='Collect', keys=['img']),
                    dict(type='PackSegInputs')
                ])
        ]))

# Logging / runtime settings.
log_config = dict(
    interval=50,
    hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True

# Optimizer and iteration-based LR schedule.
optimizer = dict(
    type='AdamW', lr=0.001, weight_decay=0.0001, betas=(0.9, 0.999))
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=1,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=False)
lr_config = dict(
    policy='poly',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-06,
    power=1.0,
    min_lr=0.0,
    by_epoch=False)
runner = dict(type='IterBasedRunner', max_iters=40000)
checkpoint_config = dict(by_epoch=False, interval=10000)
evaluation = dict(interval=40000, metric='mIoU', pre_eval=True)
fp16 = None
find_unused_parameters = True

# Model: DINOv2 ViT backbone (embed dim 384 per the decode-head in_channels)
# feeding a BNHead decoder over four concatenated feature levels.
norm_cfg = dict(type='SyncBN', requires_grad=True)
data_preprocessor = dict(type='SegDataPreProcessor')
model = dict(
    type='EncoderDecoder',
    data_preprocessor=data_preprocessor,
    pretrained=None,
    backbone=dict(type='DinoVisionTransformer', out_indices=[8, 9, 10, 11]),
    decode_head=dict(
        type='BNHead',
        in_channels=[384, 384, 384, 384],
        in_index=[0, 1, 2, 3],
        input_transform='resize_concat',
        channels=1536,
        dropout_ratio=0,
        num_classes=21,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    # NOTE(review): slide-inference crop (796, 796) differs from the
    # training crop (640, 640) — verify this is intentional.
    test_cfg=dict(mode='slide', crop_size=(796, 796), stride=(160, 160)))
auto_resume = True
gpu_ids = range(0, 8)
work_dir = '/checkpoint/dino/evaluations/segmentation/dinov2_vits14_voc2012_ms'

GZ-YourZY commented 1 year ago

请问您是如何修改的配置文件,可否提供一个dinov2分支,希望进一步交流