LLH-Harward opened this issue 3 months ago
@wondervictor
My mistake, no problem: it can be inferred from the code. Just set the score threshold in image_demo.py very low.
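For example (assuming your copy of `image_demo.py` exposes a `--threshold` flag, as recent versions of the repo's demo script do; the paths and the text prompt below are placeholders):

```
python demo/image_demo.py path/to/config.py path/to/weights.pth path/to/image.jpg "person,dog" --threshold 0.005
```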
I fine-tuned a model using x-1280, and it can detect bounding boxes, but the confidence is very low. Do you have any solutions for this?
Hi, may I ask how you solved the problem of grad-norm being 0?
If you have any method for improving the confidence scores, please let me know as well. Much appreciated.
"我尝试了下第二种prompt tuning的方法,按照您博客中的方法,冻住了neck和head以及backbone的前两层,只修改了dataset 加入了metainfo中自己的类(如下),发现grad_norm前两个epoch为0.0001 后面都为0.0000,loss不降低。"我是从评论区过来找您的,我也遇到了这样的问题,请问第二种方式prompt tuning您是怎么解决的loss正常,但grad-norm为0
Sorry, my prompt tuning did not work either. What I wrote in the comments was that grad-norm is 0 and the loss does not decrease.
"I found that grad_norm was 0.0001 for the first two epochs and 0.0000 afterwards, and the loss did not decrease."
OK, thank you. I will keep digging into it.
I suggest trying repa tuning (reparameterized fine-tuning) or the Ultralytics version.
Could you please take a look at why my repa tuning behaves the same way: grad-norm is 0 and the loss does not decrease. The second fine-tuning approach does the same thing.
Do you have more information? For example, the training log for 10 epochs and the config.
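For the zero grad-norm reports above, one quick check is whether any parameters still require gradients after freezing, and whether those parameters actually receive non-zero gradients. A minimal PyTorch sketch (an editorial aid, not from this thread; `model` and `loss` are placeholders for the detector and the loss of one training step):

```python
import torch

# Assumption: `model` is the detector (an nn.Module) and `loss` is the
# scalar loss from one forward pass on a real batch.
loss.backward()

frozen, trainable, total_sq = 0, 0, 0.0
for name, param in model.named_parameters():
    if not param.requires_grad:
        frozen += 1
        continue
    trainable += 1
    if param.grad is None or param.grad.detach().norm() == 0:
        print('zero/none grad:', name)
    else:
        total_sq += float(param.grad.detach().norm()) ** 2

print(f'{trainable} trainable / {frozen} frozen parameters, '
      f'grad norm = {total_sq ** 0.5:.6f}')
```

If every parameter shows up as frozen or with a zero gradient, the freezing configuration (or a detached branch, such as a reparameterized head fed with constant embeddings) is the likely culprit.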
```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_s_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

num_classes = 8
num_training_classes = 8
max_epochs = 300  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 4
load_from = 'weights/yolo_world_v2_s_obj365v1_goldg_pretrain-55b943ea_rep_conv.pth'
persistent_workers = False
mixup_prob = 0.15
copypaste_prob = 0.3
classes = [["rottenSurface"], ["crease"], ["growthLines"], ["healingInjury"],
           ["cavity"], ["bacterialInjury"], ["pinhole"], ["scratch"]]

model = dict(
    type='SimpleYOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_classes,
    num_test_classes=num_classes,
    reparameterized=True,
    data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
    backbone=dict(_delete_=True,
                  type='MultiModalYOLOBackbone',
                  text_model=None,
                  image_model={{_base_.model.backbone}},
                  with_text_model=False),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=num_classes,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='RepConvMaxSigmoidCSPLayerWithTwoConv',
                             guide_channels=num_classes)),
    bbox_head=dict(head_module=dict(type='RepYOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_guide=num_classes,
                                    num_classes=num_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_classes)))

final_transform = [
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction'))
]
mosaic_affine_transform = [
    dict(type='Mosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=copypaste_prob),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - _base_.affine_scale,
                              1 + _base_.affine_scale),
         border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
         border_val=(114, 114, 114),
         min_area_ratio=_base_.min_area_ratio,
         use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MixUp',
         prob=mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *final_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]

coco_train_dataset = dict(
    type='YOLOv5CocoDataset',
    metainfo=dict(classes=classes),
    data_root='leather2017/',
    ann_file='train2017/instances_train2017.json',
    data_prefix=dict(img='train2017/images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param'))
]
coco_val_dataset = dict(
    type='YOLOv5CocoDataset',
    metainfo=dict(classes=classes),
    data_root='leather2017/',
    ann_file='val2017/instances_val2017.json',
    data_prefix=dict(img='val2017/images/'),
    filter_cfg=dict(filter_empty_gt=False, min_size=32),
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

default_hooks = dict(
    param_scheduler=dict(scheduler_type='linear',
                         lr_factor=0.01,
                         max_epochs=max_epochs),
    checkpoint=dict(max_keep_ckpts=-1,
                    save_best=None,
                    interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
    constructor='YOLOWv5OptimizerConstructor')

val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='leather2017/val2017/instances_val2017.json',
                     metric='bbox')
```

Here is the config file. Could you take a look?
Or, to put it differently: it was generated directly from that class_text.json file, whose content is [["rottenSurface"], ["crease"], ["growthLines"], ["healingInjury"], ["cavity"], ["bacterialInjury"], ["pinhole"], ["scratch"]]. Could a problem with the file format have caused the generated .npy file to be wrong?
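For reference, the .npy that the reparameterized model consumes is just the CLIP text embeddings of the class texts. Below is a minimal sketch of how such a file could be produced (an assumption-laden example, not the repo's official export tool; it uses the same CLIP text encoder the non-reparameterized configs name):

```python
import json

import numpy as np
import torch
from transformers import CLIPTokenizer, CLIPTextModelWithProjection

model_name = 'openai/clip-vit-base-patch32'
tokenizer = CLIPTokenizer.from_pretrained(model_name)
text_model = CLIPTextModelWithProjection.from_pretrained(model_name)

# class_text.json holds [["rottenSurface"], ["crease"], ...]: one synonym
# list per class, in the same order as the dataset's metainfo classes.
with open('class_text.json') as f:
    texts = [entry[0] for entry in json.load(f)]

with torch.no_grad():
    inputs = tokenizer(texts, padding=True, return_tensors='pt')
    embeds = text_model(**inputs).text_embeds            # (num_classes, 512)
    embeds = embeds / embeds.norm(dim=-1, keepdim=True)  # L2-normalize

np.save('class_embeddings.npy', embeds.numpy())
```

If the JSON parses as a list of single-element lists, as yours does, the format itself should be fine; what matters more is that the entry order matches the metainfo classes.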
After fine-tuning, the detector fails to detect objects: the output image has no detected boxes, even though the loss keeps decreasing during training. I want to know whether the issue lies with my method or my data. Could anyone help? Thank you.
Inference command used after training:

```
python .\image_demo.py D:\YOLO-World-master\configs\pretrain\custom_yolo_world_l_clip.py D:\YOLO-World-master\log_200\epoch_220.pth D:\YOLO-World-master\datasets\images\train\0000001.jpg "Book"
```
metainfo:

```python
metainfo = dict(classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
                         'OtherTeachingTools', 'Book', 'Pen', 'RulerTools',
                         'Eraser', 'PencilCase', 'Laptop',
                         'NonEducationalItems', 'BlackboardWriting', 'Notes'))
```
class_text_path (data/texts/custom.json):

```json
[["Chalk"], ["Microphone"], ["MobilePhone"], ["Tablet"], ["OtherTeachingTools"],
 ["Book"], ["Pen"], ["RulerTools"], ["Eraser"], ["PencilCase"], ["Laptop"],
 ["NonEducationalItems"], ["BlackboardWriting"], ["Notes"]]
```
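One sanity check worth running here (an editorial suggestion, not from the thread): the order of entries in the class_text_path JSON must match the order of the `metainfo` classes, otherwise predictions are mapped to the wrong label names:

```python
import json

metainfo_classes = ('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
                    'OtherTeachingTools', 'Book', 'Pen', 'RulerTools',
                    'Eraser', 'PencilCase', 'Laptop', 'NonEducationalItems',
                    'BlackboardWriting', 'Notes')

with open('data/texts/custom.json') as f:
    # Each entry is a list of synonyms; compare the first synonym per class.
    texts = [entry[0] for entry in json.load(f)]

assert texts == list(metainfo_classes), 'class order mismatch'
```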
Config file:

```python
_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

import os
os.chdir('D:/YOLO-World-master')

# hyper-parameters
num_classes = 14
num_training_classes = 14
max_epochs = 500  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 1e-4
weight_decay = 0.05
train_batch_size_per_gpu = 16
load_from = ('pretrained_weights/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_'
             '32xb16_100e_o365_goldg_train_pretrained-0e566235.pth')
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(_delete_=True,
                  type='MultiModalYOLOBackbone',
                  image_model={{_base_.model.backbone}},
                  text_model=dict(type='HuggingCLIPLanguageBackbone',
                                  model_name='openai/clip-vit-base-patch32',
                                  frozen_modules=['all'])),
    neck=dict(type='YOLOWorldDualPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'),
              text_enhancder=dict(type='ImagePoolingAttentionModule',
                                  embed_channels=256,
                                  num_heads=8)),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - _base_.affine_scale,
                              1 + _base_.affine_scale),
         # img_scale is (width, height)
         border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
         border_val=(114, 114, 114))
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MultiModalMixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *text_transform
]
train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *text_transform]

metainfo = dict(classes=('Chalk', 'Microphone', 'MobilePhone', 'Tablet',
                         'OtherTeachingTools', 'Book', 'Pen', 'RulerTools',
                         'Eraser', 'PencilCase', 'Laptop',
                         'NonEducationalItems', 'BlackboardWriting', 'Notes'))
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5CocoDataset',
                 metainfo=metainfo,
                 data_root='datasets',
                 ann_file='annotations/train.json',
                 data_prefix=dict(img='images'),
                 filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param', 'texts'))
]
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(type='YOLOv5CocoDataset',
                 metainfo=metainfo,
                 data_root='datasets/',
                 ann_file='annotations/val.json',
                 data_prefix=dict(img='images'),
                 filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/custom.json',
    pipeline=test_pipeline)
val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader

# training settings
default_hooks = dict(
    param_scheduler=dict(scheduler_type='linear',
                         lr_factor=0.01,
                         max_epochs=max_epochs),
    checkpoint=dict(max_keep_ckpts=-1,
                    save_best=None,
                    interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(_delete_=True,
                   type='AdamW',
                   lr=base_lr,
                   weight_decay=weight_decay,
                   batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(custom_keys={
        'backbone.text_model': dict(lr_mult=0.01),
        'logit_scale': dict(weight_decay=0.0)
    }),
    constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='datasets/annotations/val.json',
                     metric='bbox')
```
datasets: train.json
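Since the question is whether the method or the data is at fault, a quick look at the annotation file can rule out obvious data problems. A minimal sketch (an editorial suggestion; the paths follow the config above):

```python
import json

with open('datasets/annotations/train.json') as f:
    coco = json.load(f)

# Category names should match the metainfo classes above, with contiguous ids.
cats = sorted(coco['categories'], key=lambda c: c['id'])
print([(c['id'], c['name']) for c in cats])
print(len(coco['images']), 'images,', len(coco['annotations']), 'annotations')

# Images without annotations train as pure background
# (filter_empty_gt=False keeps them in the training set).
annotated = {a['image_id'] for a in coco['annotations']}
print(len(coco['images']) - len(annotated), 'images have no annotations')
```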