AILab-CVC / YOLO-World

[CVPR 2024] Real-Time Open-Vocabulary Object Detection
https://www.yoloworld.cc
GNU General Public License v3.0

Fine-tuning problem (Issue #457)

Open LLH-Harward opened 3 months ago

LLH-Harward commented 3 months ago

After fine-tuning, the detector fails to detect objects: the output image has no detected boxes, even though the loss keeps decreasing during training. I want to know whether the issue lies with my method or my data. Could anyone help me? Thank you.

Inference command used after training:

python .\image_demo.py D:\YOLO-World-master\configs\pretrain\custom_yolo_world_l_clip.py D:\YOLO-World-master\log_200\epoch_220.pth D:\YOLO-World-master\datasets\images\train\0000001.jpg "Book"

metainfo:

metainfo = dict(classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
                         'OtherTeachingTools', 'Book', 'Pen', 'RulerTools',
                         'Eraser', 'PencilCase', 'Laptop',
                         'NonEducationalItems', 'BlackboardWriting', 'Notes'))

class_text_path (data/texts/custom.json):

[["Chalk"], ["Microphone"], ["MobilePhone"], ["Tablet"], ["OtherTeachingTools"], ["Book"], ["Pen"], ["RulerTools"], ["Eraser"], ["PencilCase"], ["Laptop"], ["NonEducationalItems"], ["BlackboardWriting"], ["Notes"]]
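For reference, the class-text file the config points at is just a JSON list of per-class text lists, and its order must match the metainfo tuple above. A minimal sketch of generating it (the path is the one from the config; adjust to your layout):

import json

classes = ('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
           'OtherTeachingTools', 'Book', 'Pen', 'RulerTools', 'Eraser',
           'PencilCase', 'Laptop', 'NonEducationalItems',
           'BlackboardWriting', 'Notes')

# each entry is itself a list, so a class can carry several text prompts
with open('data/texts/custom.json', 'w') as f:
    json.dump([[name] for name in classes], f)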

Config file:

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'],
                      allow_failed_imports=False)

import os
os.chdir('D:/YOLO-World-master')

# hyper-parameters
num_classes = 14
num_training_classes = 14
max_epochs = 500  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 1e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = 'pretrained_weights/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_train_pretrained-0e566235.pth'
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name='openai/clip-vit-base-patch32',
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - _base_.affine_scale,
                              1 + _base_.affine_scale),
         # img_scale is (width, height)
         border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
         border_val=(114, 114, 114))
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *text_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]

metainfo = dict(classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
                         'OtherTeachingTools', 'Book', 'Pen', 'RulerTools',
                         'Eraser', 'PencilCase', 'Laptop',
                         'NonEducationalItems', 'BlackboardWriting', 'Notes'))
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5CocoDataset',
                 metainfo=metainfo,
                 data_root='datasets',
                 ann_file='annotations/train.json',
                 data_prefix=dict(img='images'),
                 filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5CocoDataset',
                 metainfo=metainfo,
                 data_root='datasets/',
                 ann_file='annotations/val.json',
                 data_prefix=dict(img='images'),
                 filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(custom_keys={
        'backbone.text_model': dict(lr_mult=0.01),
        'logit_scale': dict(weight_decay=0.0)}),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='datasets/annotations/val.json',
                     metric='bbox')

datasets: (screenshot of the dataset directory)

train.json

{
  "images": [
    {
      "file_name": "val\\0000002.jpg",
      "id": 0,
      "width": 1920,
      "height": 1080
    },
    ...
  ],
  "annotations": [
    {
      "image_id": 0,
      "id": 0,
      "category_id": 9,
      "bbox": [
        342.47200000000004,
        610.78652,
        95.72999999999996,
        72.80948000000001
      ],
      "area": 6970.051520399998,
      "segmentation": [
        [
          342.47200000000004,
          610.78652,
          438.202,
          610.78652,
          438.202,
          683.596,
          342.47200000000004,
          683.596
        ]
      ],
      "iscrowd": 0
    },
    {
      "image_id": 1,
      "id": 1,
      "category_id": 9,
      "bbox": [
        542.02231,
        690.3370000000001,
        115.95522000000005,
        76.85399999999993
      ],
      "area": 8911.622477879995,
      "segmentation": [
        [
          542.02231,
          690.3370000000001,
          657.97753,
          690.3370000000001,
          657.97753,
          767.191,
          542.02231,
          767.191
        ]
      ],
      "iscrowd": 0
    },
   ...
  ],
  "categories": [
    {
      "id": 0,
      "name": "Chalk"
    },
    {
      "id": 1,
      "name": "Microphone"
    },
    {
      "id": 2,
      "name": "MobilePhone"
    },
    {
      "id": 3,
      "name": "Tablet"
    },
    {
      "id": 4,
      "name": "OtherTeachingTools"
    },
    {
      "id": 5,
      "name": "Book"
    },
    {
      "id": 6,
      "name": "Pen"
    },
    {
      "id": 7,
      "name": "RulerTools"
    },
    {
      "id": 8,
      "name": "Eraser"
    },
    {
      "id": 9,
      "name": "PencilCase"
    },
    {
      "id": 10,
      "name": "Laptop"
    },
    {
      "id": 11,
      "name": "NonEducationalItems"
    },
    {
      "id": 12,
      "name": "BlackboardWriting"
    },
    {
      "id": 13,
      "name": "Notes"
    }
  ]
}
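One quick way to rule out annotation problems is to sanity-check the COCO file before training. A minimal sketch, assuming the annotation path from the config above; the standard json module is enough:

import json

with open('datasets/annotations/train.json') as f:
    coco = json.load(f)

image_ids = {img['id'] for img in coco['images']}
category_ids = {cat['id'] for cat in coco['categories']}

for ann in coco['annotations']:
    # every annotation must point at an existing image and category
    assert ann['image_id'] in image_ids, ann
    assert ann['category_id'] in category_ids, ann
    x, y, w, h = ann['bbox']  # COCO bboxes are [x, y, width, height]
    assert w > 0 and h > 0, ann

print(f"{len(coco['images'])} images, {len(coco['annotations'])} annotations, "
      f"{len(coco['categories'])} categories - OK")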
LLH-Harward commented 3 months ago

@wondervictor

LLH-Harward commented 3 months ago

My mistake, there is no problem; the boxes can be obtained with the existing code. Just set the score threshold in image_demo.py very low.
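For example, a sketch of such an invocation; the --threshold flag name is taken from recent versions of the demo script, so verify it against your local image_demo.py:

python .\image_demo.py D:\YOLO-World-master\configs\pretrain\custom_yolo_world_l_clip.py D:\YOLO-World-master\log_200\epoch_220.pth D:\YOLO-World-master\datasets\images\train\0000001.jpg "Book" --threshold 0.005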

LLH-Harward commented 3 months ago

I fine-tuned a model using x-1280, and it can detect bounding boxes, but the confidence is very low. Do you have any solutions for this?

2879982985 commented 2 months ago

Hello, may I ask how you solved the problem of grad-norm being 0?

LLH-Harward commented 2 months ago

https://blog.csdn.net/ITdaka/article/details/139306984#comments_34204896

https://blog.csdn.net/weixin_44878336/article/details/140177504#comments_34278560

You can refer to these two posts.

LLH-Harward commented 2 months ago

If you have any method for improving the confidence, please let me know as well. I would be very grateful.

2879982985 commented 2 months ago

"我尝试了下第二种prompt tuning的方法,按照您博客中的方法,冻住了neck和head以及backbone的前两层,只修改了dataset 加入了metainfo中自己的类(如下),发现grad_norm前两个epoch为0.0001 后面都为0.0000,loss不降低。"我是从评论区过来找您的,我也遇到了这样的问题,请问第二种方式prompt tuning您是怎么解决的loss正常,但grad-norm为0

LLH-Harward commented 2 months ago

Sorry, my prompt tuning did not work either. What I wrote in the comment section was that grad_norm was 0 and the loss did not decrease:

grad_norm was 0.0001 for the first two epochs and 0.0000 afterwards, and the loss did not decrease.
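When grad_norm sits at exactly 0, the first thing worth checking is whether any parameters are still trainable after freezing. A minimal PyTorch sketch; build the model from your config as usual, these helpers only inspect it:

import torch

def report_trainable(model: torch.nn.Module) -> None:
    """Print how many parameters are frozen vs. trainable."""
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    frozen = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    print(f'trainable params: {trainable:,}  frozen params: {frozen:,}')

def grad_norm(model: torch.nn.Module) -> float:
    """Global L2 norm of all gradients; call right after loss.backward()."""
    total = 0.0
    for param in model.parameters():
        if param.grad is not None:
            total += param.grad.detach().norm(2).item() ** 2
    return total ** 0.5

If every module ended up frozen, the trainable count is 0 and grad_norm stays at exactly 0 regardless of what the loss does.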

2879982985 commented 2 months ago

OK, thank you. I will look into it some more.

LLH-Harward commented 2 months ago

I would suggest repa (reparameterized) tuning or the Ultralytics version.

2879982985 commented 2 months ago

(screenshot: bug.png) Could you take a look at why my repa tuning behaves the same way? grad-norm is 0 and the loss does not decrease; the second fine-tuning approach does the same.

LLH-Harward commented 2 months ago

Do you have more information? For example, the logs of 10 epochs and the config.


2879982985 commented 2 months ago

(screenshot: bug.png)

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 8
num_training_classes = 8
max_epochs = 300  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 4
load_from = 'weights/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea_rep_conv.pth'
persistent_workers = False
mixup_prob = 0.15
copypaste_prob = 0.3
classes = [['rottenSurface'], ['crease'], ['growthLines'], ['healingInjury'],
           ['cavity'], ['bacterialInjury'], ['pinhole'], ['scratch']]

# model settings
model = dict(type='SimpleYOLOWorldDetector',
             mm_neck=True,
             num_train_classes=num_classes,
             num_test_classes=num_classes,
             reparameterized=True,
             data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
             backbone=dict(_delete_=True,
                           type='MultiModalYOLOBackbone',
                           text_model=None,
                           image_model={{_base_.model.backbone}},
                           with_text_model=False),
             neck=dict(type='YOLOWorldPAFPN',
                       guide_channels=num_classes,
                       embed_channels=neck_embed_channels,
                       num_heads=neck_num_heads,
                       block_cfg=dict(
                           type='RepConvMaxSigmoidCSPLayerWithTwoConv',
                           guide_channels=num_classes)),
             bbox_head=dict(head_module=dict(
                 type='RepYOLOWorldHeadModule',
                 embed_dims=text_channels,
                 num_guide=num_classes,
                 num_classes=num_classes)),
             train_cfg=dict(assigner=dict(num_classes=num_classes)))

# dataset settings
final_transform = [
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction'))
]
mosaic_affine_transform = [
    dict(type='Mosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - _base_.affine_scale,
                              1 + _base_.affine_scale),
         # img_scale is (width, height)
         border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
         border_val=(114, 114, 114),
         min_area_ratio=_base_.min_area_ratio,
         use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MixUp',
         prob=mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *final_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]

coco_train_dataset = dict(type='YOLOv5CocoDataset',
                          metainfo=dict(classes=classes),
                          data_root='leather2017/',
                          ann_file='train2017/instances_train2017.json',
                          data_prefix=dict(img='train2017/images/'),
                          filter_cfg=dict(filter_empty_gt=False, min_size=32),
                          pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param'))
]
coco_val_dataset = dict(type='YOLOv5CocoDataset',
                        metainfo=dict(classes=classes),
                        data_root='leather2017/',
                        ann_file='val2017/instances_val2017.json',
                        data_prefix=dict(img='val2017/images/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=test_pipeline)

val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='leather2017/val2017/instances_val2017.json',
                     metric='bbox')

This is the config file; could you take a look?

2879982985 commented 2 months ago

Or rather, the file was generated directly from that class_text.json file, whose content is [["rottenSurface"], ["crease"], ["growthLines"], ["healingInjury"], ["cavity"], ["bacterialInjury"], ["pinhole"], ["scratch"]]. Could a problem with the file format be making the generated .npy file wrong?
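For what it's worth, the .npy mentioned here holds per-class text embeddings. A minimal sketch of how such a file can be produced from a class-text JSON with the CLIP text encoder via Hugging Face transformers; the output filename and the exact format expected by the repo's reparameterize tooling are assumptions, so compare against the repo's own export script:

import json
import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor

# load the per-class text prompts, e.g. [["rottenSurface"], ["crease"], ...]
with open('class_text.json') as f:
    texts = [t[0] for t in json.load(f)]

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

with torch.no_grad():
    inputs = processor(text=texts, return_tensors='pt', padding=True)
    embeds = model.get_text_features(**inputs)
    # YOLO-World matches against L2-normalized text embeddings
    embeds = embeds / embeds.norm(dim=-1, keepdim=True)

# hypothetical output name; shape is (num_classes, 512) for this CLIP variant
np.save('class_embeddings.npy', embeds.numpy())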