hhitdata opened 1 week ago
The model converges when trained on a dataset of about 7,000 images, but after switching to a different dataset of over 100,000 images with different classes, all inference results are empty after training for 12 epochs, and the loss stays around 80. The config file is below; I'm training on 4 RTX 3090 GPUs. Where could the problem be?
```python
dataset_type = 'CocoDataset'
data_root = '/mnt/disk3/Co-DETR/dataset/wxcl/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='AutoAugment',
        policies=[[{
            'type': 'Resize',
            'img_scale': [(480, 1280), (512, 1280), (544, 1280), (576, 1280),
                          (608, 1280), (640, 1280), (672, 1280), (704, 1280),
                          (736, 1280), (768, 1280), (800, 1280)],
            'multiscale_mode': 'value',
            'keep_ratio': True
        }],
                  [{
                      'type': 'Resize',
                      'img_scale': [(400, 4200), (500, 4200), (600, 4200)],
                      'multiscale_mode': 'value',
                      'keep_ratio': True
                  }, {
                      'type': 'RandomCrop',
                      'crop_type': 'absolute_range',
                      'crop_size': (384, 600),
                      'allow_negative_crop': True
                  }, {
                      'type': 'Resize',
                      'img_scale': [(480, 1280), (512, 1280), (544, 1280),
                                    (576, 1280), (608, 1280), (640, 1280),
                                    (672, 1280), (704, 1280), (736, 1280),
                                    (768, 1280), (800, 1280)],
                      'multiscale_mode': 'value',
                      'override': True,
                      'keep_ratio': True
                  }]]),
    dict(
        type='Normalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='Pad', size_divisor=1),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1280, 800),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=1),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=1,
    workers_per_gpu=1,
    train=dict(
        type='CocoDataset',
        ann_file='/mnt/disk3/Co-DETR/dataset/wxcl/Annotations/train.json',
        img_prefix='/mnt/disk3/Co-DETR/dataset/wxcl/Images/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(
                type='AutoAugment',
                policies=[[{
                    'type': 'Resize',
                    'img_scale': [(480, 1280), (512, 1280), (544, 1280),
                                  (576, 1280), (608, 1280), (640, 1280),
                                  (672, 1280), (704, 1280), (736, 1280),
                                  (768, 1280), (800, 1280)],
                    'multiscale_mode': 'value',
                    'keep_ratio': True
                }],
                          [{
                              'type': 'Resize',
                              'img_scale': [(400, 4200), (500, 4200),
                                            (600, 4200)],
                              'multiscale_mode': 'value',
                              'keep_ratio': True
                          }, {
                              'type': 'RandomCrop',
                              'crop_type': 'absolute_range',
                              'crop_size': (384, 600),
                              'allow_negative_crop': True
                          }, {
                              'type': 'Resize',
                              'img_scale': [(480, 1280), (512, 1280),
                                            (544, 1280), (576, 1280),
                                            (608, 1280), (640, 1280),
                                            (672, 1280), (704, 1280),
                                            (736, 1280), (768, 1280),
                                            (800, 1280)],
                              'multiscale_mode': 'value',
                              'override': True,
                              'keep_ratio': True
                          }]]),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=1),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
        ],
        filter_empty_gt=False),
    val=dict(
        type='CocoDataset',
        ann_file='/mnt/disk3/Co-DETR/dataset/wxcl/realistic_application_2/val.json',
        img_prefix='/mnt/disk3/Co-DETR/dataset/wxcl/realistic_application_2/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1280, 800),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=1),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='CocoDataset',
        ann_file='/mnt/disk3/Co-DETR/dataset/wxcl/realistic_application_2/val.json',
        img_prefix='/mnt/disk3/Co-DETR/dataset/wxcl/realistic_application_2/',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(1280, 800),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[123.675, 116.28, 103.53],
                        std=[58.395, 57.12, 57.375],
                        to_rgb=True),
                    dict(type='Pad', size_divisor=1),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
evaluation = dict(interval=1, metric='bbox')
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
custom_hooks = [dict(type='NumClassCheckHook')]
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
opencv_num_threads = 2
mp_start_method = 'fork'
auto_scale_lr = dict(enable=False, base_batch_size=16)
num_dec_layer = 4
lambda_2 = 2.0
model = dict(
    type='CoDETR',
    backbone=dict(
        type='SwinTransformerV1',
        embed_dim=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        out_indices=(0, 1, 2, 3),
        window_size=12,
        ape=False,
        drop_path_rate=0.3,
        patch_norm=True,
        use_checkpoint=False,
        pretrained='/mnt/disk3/Co-DETR/models/co_dino_5scale_swin_large_3x_coco.pth'),
    neck=dict(
        type='ChannelMapper',
        in_channels=[192, 384, 768, 1536],
        kernel_size=1,
        out_channels=256,
        act_cfg=None,
        norm_cfg=dict(type='GN', num_groups=32),
        num_outs=5),
    rpn_head=dict(
        type='RPNHead',
        in_channels=256,
        feat_channels=256,
        anchor_generator=dict(
            type='AnchorGenerator',
            octave_base_scale=4,
            scales_per_octave=3,
            ratios=[0.5, 1.0, 2.0],
            strides=[4, 8, 16, 32, 64, 128]),
        bbox_coder=dict(
            type='DeltaXYWHBBoxCoder',
            target_means=[0.0, 0.0, 0.0, 0.0],
            target_stds=[1.0, 1.0, 1.0, 1.0]),
        loss_cls=dict(
            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=12.0),
        loss_bbox=dict(type='L1Loss', loss_weight=12.0)),
    query_head=dict(
        type='CoDINOHead',
        num_query=900,
        num_classes=8,
        num_feature_levels=5,
        in_channels=2048,
        sync_cls_avg_factor=True,
        as_two_stage=True,
        with_box_refine=True,
        mixed_selection=True,
        dn_cfg=dict(
            type='CdnQueryGenerator',
            noise_scale=dict(label=0.5, box=1.0),
            group_cfg=dict(dynamic=True, num_groups=None,
                           num_dn_queries=100)),
        transformer=dict(
            type='CoDinoTransformer',
            with_pos_coord=True,
            with_coord_feat=False,
            num_co_heads=2,
            num_feature_levels=5,
            encoder=dict(
                type='DetrTransformerEncoder',
                num_layers=6,
                with_cp=6,
                transformerlayers=dict(
                    type='BaseTransformerLayer',
                    attn_cfgs=dict(
                        type='MultiScaleDeformableAttention',
                        embed_dims=256,
                        num_levels=5,
                        dropout=0.0),
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'ffn', 'norm'))),
            decoder=dict(
                type='DinoTransformerDecoder',
                num_layers=6,
                return_intermediate=True,
                transformerlayers=dict(
                    type='DetrTransformerDecoderLayer',
                    attn_cfgs=[
                        dict(
                            type='MultiheadAttention',
                            embed_dims=256,
                            num_heads=8,
                            dropout=0.0),
                        dict(
                            type='MultiScaleDeformableAttention',
                            embed_dims=256,
                            num_levels=5,
                            dropout=0.0)
                    ],
                    feedforward_channels=2048,
                    ffn_dropout=0.0,
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))),
        positional_encoding=dict(
            type='SinePositionalEncoding',
            num_feats=128,
            temperature=20,
            normalize=True),
        loss_cls=dict(
            type='QualityFocalLoss',
            use_sigmoid=True,
            beta=2.0,
            loss_weight=1.0),
        loss_bbox=dict(type='L1Loss', loss_weight=5.0),
        loss_iou=dict(type='GIoULoss', loss_weight=2.0)),
    roi_head=[
        dict(
            type='CoStandardRoIHead',
            bbox_roi_extractor=dict(
                type='SingleRoIExtractor',
                roi_layer=dict(
                    type='RoIAlign', output_size=7, sampling_ratio=0),
                out_channels=256,
                featmap_strides=[4, 8, 16, 32, 64],
                finest_scale=56),
            bbox_head=dict(
                type='Shared2FCBBoxHead',
                in_channels=256,
                fc_out_channels=1024,
                roi_feat_size=7,
                num_classes=8,
                bbox_coder=dict(
                    type='DeltaXYWHBBoxCoder',
                    target_means=[0.0, 0.0, 0.0, 0.0],
                    target_stds=[0.1, 0.1, 0.2, 0.2]),
                reg_class_agnostic=False,
                reg_decoded_bbox=True,
                loss_cls=dict(
                    type='CrossEntropyLoss',
                    use_sigmoid=False,
                    loss_weight=12.0),
                loss_bbox=dict(type='GIoULoss', loss_weight=120.0)))
    ],
    bbox_head=[
        dict(
            type='CoATSSHead',
            num_classes=8,
            in_channels=256,
            stacked_convs=1,
            feat_channels=256,
            anchor_generator=dict(
                type='AnchorGenerator',
                ratios=[1.0],
                octave_base_scale=8,
                scales_per_octave=1,
                strides=[4, 8, 16, 32, 64, 128]),
            bbox_coder=dict(
                type='DeltaXYWHBBoxCoder',
                target_means=[0.0, 0.0, 0.0, 0.0],
                target_stds=[0.1, 0.1, 0.2, 0.2]),
            loss_cls=dict(
                type='FocalLoss',
                use_sigmoid=True,
                gamma=2.0,
                alpha=0.25,
                loss_weight=12.0),
            loss_bbox=dict(type='GIoULoss', loss_weight=24.0),
            loss_centerness=dict(
                type='CrossEntropyLoss', use_sigmoid=True, loss_weight=12.0))
    ],
    train_cfg=[
        dict(
            assigner=dict(
                type='HungarianAssigner',
                cls_cost=dict(type='FocalLossCost', weight=2.0),
                reg_cost=dict(
                    type='BBoxL1Cost', weight=5.0, box_format='xywh'),
                iou_cost=dict(type='IoUCost', iou_mode='giou', weight=2.0))),
        dict(
            rpn=dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.7,
                    neg_iou_thr=0.3,
                    min_pos_iou=0.3,
                    match_low_quality=True,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=256,
                    pos_fraction=0.5,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=False),
                allowed_border=-1,
                pos_weight=-1,
                debug=False),
            rpn_proposal=dict(
                nms_pre=4000,
                max_per_img=1000,
                nms=dict(type='nms', iou_threshold=0.7),
                min_bbox_size=0),
            rcnn=dict(
                assigner=dict(
                    type='MaxIoUAssigner',
                    pos_iou_thr=0.5,
                    neg_iou_thr=0.5,
                    min_pos_iou=0.5,
                    match_low_quality=False,
                    ignore_iof_thr=-1),
                sampler=dict(
                    type='RandomSampler',
                    num=512,
                    pos_fraction=0.25,
                    neg_pos_ub=-1,
                    add_gt_as_proposals=True),
                pos_weight=-1,
                debug=False)),
        dict(
            assigner=dict(type='ATSSAssigner', topk=9),
            allowed_border=-1,
            pos_weight=-1,
            debug=False)
    ],
    test_cfg=[
        dict(max_per_img=300, nms=dict(type='soft_nms', iou_threshold=0.8)),
        dict(
            rpn=dict(
                nms_pre=1000,
                max_per_img=1000,
                nms=dict(type='nms', iou_threshold=0.7),
                min_bbox_size=0),
            rcnn=dict(
                score_thr=0.0,
                nms=dict(type='nms', iou_threshold=0.5),
                max_per_img=100)),
        dict(
            nms_pre=1000,
            min_bbox_size=0,
            score_thr=0.0,
            nms=dict(type='nms', iou_threshold=0.6),
            max_per_img=100)
    ])
optimizer = dict(
    type='AdamW',
    lr=0.0004,
    weight_decay=0.0001,
    paramwise_cfg=dict(custom_keys=dict(backbone=dict(lr_mult=0.1))))
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(policy='step', step=[9])
runner = dict(type='EpochBasedRunner', max_epochs=12)
pretrained = '/mnt/disk3/Co-DETR/models/co_dino_5scale_swin_large_3x_coco.pth'
work_dir = './run3'
auto_resume = False
gpu_ids = range(0, 4)
```
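For reference, a quick consistency check on this config (a minimal sketch, assuming the mmcv 1.x `Config` API that the Co-DETR codebase builds on; the config path is hypothetical):

```python
from mmcv import Config

# Hypothetical path: wherever the config above is saved on disk.
cfg = Config.fromfile('./run3/co_dino_swin_large_wxcl.py')

# All three heads must agree on the number of foreground classes.
num_classes = [
    cfg.model.query_head.num_classes,             # CoDINOHead
    cfg.model.roi_head[0].bbox_head.num_classes,  # CoStandardRoIHead
    cfg.model.bbox_head[0].num_classes,           # CoATSSHead
]
print(num_classes)  # expected: [8, 8, 8]
assert len(set(num_classes)) == 1, 'heads disagree on num_classes'
```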
Does it converge if you train this data with Faster R-CNN from mmdetection? If it still doesn't converge, then either your 100k+ images are fairly noisy, or the custom-dataset setup is wrong.
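One way to check the dataset side directly is to inspect the new train.json (a minimal sketch using pycocotools, which `CocoDataset` wraps internally; paths are taken from the config above):

```python
from pycocotools.coco import COCO

coco = COCO('/mnt/disk3/Co-DETR/dataset/wxcl/Annotations/train.json')

# The config hard-codes num_classes=8 in every head, so the annotation
# file should define exactly 8 categories.
cats = coco.loadCats(coco.getCatIds())
print(len(cats), [(c['id'], c['name']) for c in cats])

# filter_empty_gt=False keeps images with no boxes; on a noisy 100k-image
# set, a large fraction of empty images is one plausible noise source.
img_ids = coco.getImgIds()
empty = sum(1 for i in img_ids if not coco.getAnnIds(imgIds=[i]))
print(f'{empty}/{len(img_ids)} images have no annotations')
```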
This same data converged when I previously trained it with YOLOv5; I'll take another look at the custom dataset settings.
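For what it's worth, a common pitfall with custom COCO data in mmdetection 2.x is not passing `classes` to each dataset split: `CocoDataset` then falls back to the 80 COCO class names, matches none of them against the custom category names, and loads no ground-truth boxes, which can produce exactly a flat loss and empty predictions. A sketch with hypothetical class names:

```python
# Hypothetical names -- replace with the 8 categories defined in train.json,
# listed in ascending order of their category ids.
classes = ('cls1', 'cls2', 'cls3', 'cls4', 'cls5', 'cls6', 'cls7', 'cls8')

data = dict(
    train=dict(classes=classes),
    val=dict(classes=classes),
    test=dict(classes=classes))
```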