OFA-Sys / ONE-PEACE

A general representation model across vision, audio, language modalities. Paper: ONE-PEACE: Exploring One General Representation Model Toward Unlimited Modalities
Apache License 2.0
981 stars 64 forks source link

Cityscapes Train #27

Closed RYHSmmc closed 1 year ago

RYHSmmc commented 1 year ago

Hello, thanks for your great work. When I train ONE-PEACE on Cityscapes for segmentation, I get a terrible result of 0.02 (IoU) at 5000 training steps. Does anybody know why? I have changed the number of classes to 19.

simonJJJ commented 1 year ago

Hi @RYHSmmc, thanks for your interest.

Please provide more context, such as your environment, config file, the command you ran, etc.

RYHSmmc commented 1 year ago

@simonJJJ thanks for your reply, the config is listed below.

RYHSmmc commented 1 year ago

TorchVision: 0.10.1+cu111 OpenCV: 4.7.0 MMCV: 1.5.0 MMCV Compiler: GCC 9.4 MMCV CUDA Compiler: 11.1 MMSegmentation: 0.30.0+unknown 2023-08-23 07:06:06,595 - mmseg - INFO - Distributed training: True 2023-08-23 07:06:07,354 - mmseg - INFO - Config: num_things_classes = 8 num_stuff_classes = 11 num_classes = 19 norm_cfg = dict(type='SyncBN', requires_grad=True) custom_imports = dict( imports=['optimizer.onepeace_layer_decay_optimizer_constructor'], allow_failed_imports=False) model = dict( type='EncoderDecoderMask2Former', pretrained= 'xxxxx/workspace/code/cityscapes/ONE-PEACE-main/pretrained/one-peace-vision.pkl', backbone=dict( type='OnePeaceAdapter', attention_heads=24, bucket_size=56, dropout=0.0, embed_dim=1536, ffn_embed_dim=6144, layers=40, use_checkpoint=True, rp_bias=True, shared_rp_bias=False, init_values=1e-06, drop_path_rate=0.5, conv_inplane=64, n_points=4, deform_num_heads=24, cffn_ratio=0.25, deform_ratio=0.5, with_cp=True, interaction_indexes=[[0, 9], [10, 19], [20, 29], [30, 39]]), decode_head=dict( type='Mask2FormerHead', in_channels=[1536, 1536, 1536, 1536], feat_channels=1024, out_channels=1024, in_index=[0, 1, 2, 3], num_things_classes=8, num_stuff_classes=11, num_queries=200, num_transformer_feat_level=3, pixel_decoder=dict( type='MSDeformAttnPixelDecoder', num_outs=3, norm_cfg=dict(type='GN', num_groups=32), act_cfg=dict(type='ReLU'), encoder=dict( type='DetrTransformerEncoder', num_layers=6, transformerlayers=dict( type='BaseTransformerLayer', attn_cfgs=dict( type='MultiScaleDeformableAttention', embed_dims=1024, num_heads=32, num_levels=3, num_points=4, im2col_step=64, dropout=0.0, batch_first=False, norm_cfg=None, init_cfg=None), ffn_cfgs=dict( type='FFN', embed_dims=1024, feedforward_channels=4096, num_fcs=2, ffn_drop=0.0, act_cfg=dict(type='ReLU', inplace=True), with_cp=True), operation_order=('self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), positional_encoding=dict( type='SinePositionalEncoding', num_feats=512, normalize=True), 
init_cfg=None), enforce_decoder_input_project=False, positional_encoding=dict( type='SinePositionalEncoding', num_feats=512, normalize=True), transformer_decoder=dict( type='DetrTransformerDecoder', return_intermediate=True, num_layers=9, transformerlayers=dict( type='DetrTransformerDecoderLayer', attn_cfgs=dict( type='MultiheadAttention', embed_dims=1024, num_heads=32, attn_drop=0.0, proj_drop=0.0, dropout_layer=None, batch_first=False), ffn_cfgs=dict( embed_dims=1024, feedforward_channels=4096, num_fcs=2, act_cfg=dict(type='ReLU', inplace=True), ffn_drop=0.0, dropout_layer=None, add_identity=True, with_cp=True), feedforward_channels=4096, operation_order=('cross_attn', 'norm', 'self_attn', 'norm', 'ffn', 'norm')), init_cfg=None), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=2.0, reduction='mean', class_weight=[ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.1 ]), loss_mask=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=5.0), loss_dice=dict( type='DiceLoss', use_sigmoid=True, activate=True, reduction='mean', naive_dice=True, eps=1.0, loss_weight=5.0)), train_cfg=dict( num_points=12544, oversample_ratio=3.0, importance_sample_ratio=0.75, assigner=dict( type='MaskHungarianAssigner', cls_cost=dict(type='ClassificationCost', weight=2.0), mask_cost=dict( type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), dice_cost=dict( type='DiceCost', weight=5.0, pred_act=True, eps=1.0)), sampler=dict(type='MaskPseudoSampler')), test_cfg=dict( panoptic_on=True, semantic_on=False, instance_on=True, max_per_image=100, iou_thr=0.8, filter_low_score=True, mode='slide', crop_size=(896, 896), stride=(512, 512)), init_cfg=None) dataset_type = 'CityDataset' data_root = 'xxxx/workspace/dataset/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (896, 896) train_pipeline = [ dict(type='LoadImageFromFile'), 
dict(type='LoadAnnotations'), dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=(896, 896), cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size=(896, 896), pad_val=0, seg_pad_val=255), dict(type='ToMask'), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 1024), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='CityDataset', data_root='xxxx/workspace/dataset/', img_dir=[ 'cityscapes/images/train', ], ann_dir=[ 'cityscapes/gt/train' ], pipeline=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict( type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=(896, 896), cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PhotoMetricDistortion'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size=(896, 896), pad_val=0, seg_pad_val=255), dict(type='ToMask'), dict(type='DefaultFormatBundle'), dict( type='Collect', keys=['img', 'gt_semantic_seg', 'gt_masks', 'gt_labels']) ]), val=dict( type='CityDataset', data_root='xxxx/workspace/dataset/', img_dir='cityscapes/images/val', ann_dir='cityscapes/gt/val', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 1024), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), 
dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ]), test=dict( type='CityDataset', data_root='xxxx/workspace/dataset/', img_dir='cityscapes/images/test', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 1024), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ])) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)]) dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] cudnn_benchmark = True optimizer = dict( type='AdamW', lr=1e-05, betas=(0.9, 0.999), weight_decay=0.05, constructor='OnePeaceLearningRateDecayOptimizerConstructor', paramwise_cfg=dict(num_layers=40, decay_rate=0.95)) optimizer_config = dict() lr_config = dict( policy='poly', warmup='linear', warmup_iters=1500, warmup_ratio=1e-06, power=1.0, min_lr=0.0, by_epoch=False) runner = dict(type='IterBasedRunner', max_iters=80000) checkpoint_config = dict( by_epoch=False, interval=4000, max_keep_ckpts=3, create_symlink=False) evaluation = dict(interval=100, metric='mIoU', pre_eval=True, save_best='mIoU') pretrained = 'xxxx/workspace/code/cityscapes/ONE-PEACE-main/pretrained/one-peace-vision.pkl' work_dir = './work_dirs/city_40k_ss' gpu_ids = range(0, 8) auto_resume = False

simonJJJ commented 1 year ago

@RYHSmmc One diff I observe is the image normalization mean and std. You should set mean=[122.771, 116.746, 104.094], std=[68.5, 66.632, 70.323].

Also, make sure you are using the same library versions as mentioned in the README, which are mmcv-full==1.5.0, mmdet==2.22.0, mmsegmentation==0.30.0, and timm==0.5.4.

simonJJJ commented 1 year ago

Also, insert dict(type='ResizeToMultiple', size_divisor=32) between dict(type='Resize', keep_ratio=True) and dict(type='Normalize',mean=[123.675, 116.28, 103.53],std=[58.395, 57.12, 57.375],to_rgb=True) in val.pipeline.

RYHSmmc commented 1 year ago

okok, thanks!!