AILab-CVC / YOLO-World

[CVPR 2024] Real-Time Open-Vocabulary Object Detection
https://www.yoloworld.cc
GNU General Public License v3.0

Failed to finetune on custom coco-format dataset when using prompt-tuning. #276

Open · lin-whale opened this issue 5 months ago

lin-whale commented 5 months ago

Training completes, but the performance does not improve and the validation phase fails. The config is below, modified from configs/prompt_tuning_coco/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.py

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 50
num_training_classes = 50
max_epochs = 100  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.05
train_batch_size_per_gpu = 16
# load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

# model settings
model = dict(type='YOLOWorldPromptDetector',
             mm_neck=True,
             num_train_classes=num_training_classes,
             num_test_classes=num_classes,
             embedding_path='embeddings/hospital_50.npy',
             prompt_dim=text_channels,
             num_prompts=50,
             data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
             backbone=dict(_delete_=True,
                           type='MultiModalYOLOBackbone',
                           text_model=None,
                           image_model={{_base_.model.backbone}},
                           frozen_stages=4,
                           with_text_model=False),
             neck=dict(type='YOLOWorldPAFPN',
                       freeze_all=True,
                       guide_channels=text_channels,
                       embed_channels=neck_embed_channels,
                       num_heads=neck_num_heads,
                       block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
             bbox_head=dict(type='YOLOWorldHead',
                            head_module=dict(
                                type='YOLOWorldHeadModule',
                                freeze_all=True,
                                use_bn_head=True,
                                embed_dims=text_channels,
                                num_classes=num_training_classes)),
             train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
final_transform = [
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction'))
]
mosaic_affine_transform = [
    dict(type='Mosaic',
         img_scale=_base_.img_scale,
         pad_val=114.0,
         pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale, 1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform, *mosaic_affine_transform,
    dict(type='YOLOv5MixUp',
         prob=_base_.mixup_prob,
         pre_transform=[*_base_.pre_transform, *mosaic_affine_transform]),
    *_base_.last_transform[:-1], *final_transform
]

train_pipeline_stage2 = [*_base_.train_pipeline_stage2[:-1], *final_transform]
meta_info = dict(classes=('floor', 'person', 'paper', 'bottle', 'paper cup',
                          'mask', 'thread', 'waiting bench', 'sturdy',
                          'plastic bag', 'table', 'packaging bag', 'door',
                          'carton box', 'sticker', 'screen', 'book',
                          'cotton ball', 'warning sign', 'rod', 'poster rack',
                          'vomit', 'blood', 'traffic cone', 'trash can',
                          'cart', 'rack', 'bag', 'flowerpot', 'medication',
                          'paper box', 'meal box', 'pericarp', 'hat',
                          'umbrella', 'drip stand', 'coffee stains',
                          'elevator entrance', 'escalator entrance',
                          'triage desk', 'registration machine',
                          'fire hydrant', 'hospital bed', 'milk stains',
                          'plinth', 'chair', 'wheel chair', 'swab',
                          'drinking cup', 'fallen leaves'))

coco_train_dataset = dict(type='YOLOv5CocoDataset',
                          metainfo=meta_info,
                          data_root='/data/cvat/train/2024-04-02-ann-cvat',
                          ann_file='/data/cvat/train/2024-04-02-ann-cvat/annotations/annotations.json.train',
                          data_prefix=dict(img='images/'),
                          filter_cfg=dict(filter_empty_gt=False, min_size=32),
                          pipeline=train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)

test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                    'scale_factor', 'pad_param'))
]
coco_val_dataset = dict(type='YOLOv5CocoDataset',
                        metainfo=meta_info,
                        data_root='/data/cvat/train/2024-04-02-ann-cvat',
                        ann_file='/data/cvat/train/2024-04-02-ann-cvat/annotations/annotations.json.test',
                        data_prefix=dict(img='images/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=test_pipeline)

val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader
# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best="coco/bbox_mAP_50",
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])
optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='AdamW',
    lr=base_lr,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu),
                     paramwise_cfg=dict(bias_decay_mult=0.0,
                                        norm_decay_mult=0.0,
                                        custom_keys={
                                            'backbone.text_model':
                                            dict(lr_mult=0.01),
                                            'logit_scale':
                                            dict(weight_decay=0.0),
                                            'embeddings':
                                            dict(weight_decay=0.0)
                                        }),
                     constructor='YOLOWv5OptimizerConstructor')

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='/data/cvat/train/2024-04-02-ann-cvat/annotations/annotations.json.test',
                     metric='bbox',
                     classwise=True)
find_unused_parameters = True
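
For reference, embeddings/hospital_50.npy is expected to hold one text embedding per prompt, i.e. a (num_prompts, text_channels) = (50, 512) float array, since the config sets text_model=None and with_text_model=False. A minimal sketch of how such a file can be produced (assuming the HuggingFace transformers CLIP API; an illustration, not necessarily the repo's own export script):

import numpy as np
import torch
from transformers import AutoTokenizer, CLIPTextModelWithProjection

# All 50 class names from meta_info above; truncated here for brevity.
classes = ['floor', 'person', 'paper']  # ... extend with the remaining names

tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-base-patch32')
text_encoder = CLIPTextModelWithProjection.from_pretrained(
    'openai/clip-vit-base-patch32')

with torch.no_grad():
    tokens = tokenizer(classes, padding=True, return_tensors='pt')
    embeds = text_encoder(**tokens).text_embeds           # (num_classes, 512)
    embeds = embeds / embeds.norm(dim=-1, keepdim=True)   # L2-normalize, as CLIP does

np.save('embeddings/hospital_50.npy', embeds.numpy())
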
lin-whale commented 5 months ago

The training output:


]
use_mask2refine = True
val_ann_file = 'test/2024-03-28-ann-cvat/annotations/annotations.json'
val_batch_size_per_gpu = 1
val_cfg = dict(type='ValLoop')
val_data_prefix = 'test/2024-03-28-ann-cvat/images/'
val_dataloader = dict(
    batch_size=1,
    dataset=dict(
        ann_file=
        '/data/cvat/train/2024-04-02-ann-cvat/annotations/annotations.json.test',
        batch_shapes_cfg=None,
        data_prefix=dict(img='images/'),
        data_root='/data/cvat/train/2024-04-02-ann-cvat',
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        metainfo=dict(
            classes=(
                'floor',
                'person',
                'paper',
                'bottle',
                'paper cup',
                ...
            )),
        pipeline=[
            dict(backend_args=None, type='LoadImageFromFile'),
            dict(scale=(
                640,
                640,
            ), type='YOLOv5KeepRatioResize'),
            dict(
                allow_scale_up=False,
                pad_val=dict(img=114),
                scale=(
                    640,
                    640,
                ),
...
Done (t=0.20s)
creating index...
index created!
04/24 01:45:39 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io
04/24 01:45:39 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future.
04/24 01:45:39 - mmengine - INFO - Checkpoints will be saved to /home/aistar/yolo-world/YOLO-World/work_dirs/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco.
/opt/conda/lib/python3.10/site-packages/torch/functional.py:507: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /opt/conda/conda-bld/pytorch_1711403380909/work/aten/src/ATen/native/TensorShape.cpp:3549.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
04/24 01:45:59 - mmengine - INFO - Epoch(train)   [1][ 50/290]  base_lr: 2.0000e-03 lr: 9.8000e-05  eta: 3:15:16  time: 0.4047  data_time: 0.0721  memory: 10524  grad_norm: 0.0000  loss: 209.2231  loss_cls: 85.0021  loss_bbox: 57.6584  loss_dfl: 66.5625
04/24 01:46:11 - mmengine - INFO - Epoch(train)   [1][100/290]  base_lr: 2.0000e-03 lr: 1.9800e-04  eta: 2:36:04  time: 0.2433  data_time: 0.0477  memory: 4760  grad_norm: 0.0000  loss: 208.6729  loss_cls: 84.9814  loss_bbox: 57.1290  loss_dfl: 66.5625
04/24 01:46:20 - mmengine - INFO - Epoch(train)   [1][150/290]  base_lr: 2.0000e-03 lr: 2.9800e-04  eta: 2:12:05  time: 0.1761  data_time: 0.0129  memory: 4706  grad_norm: 0.0000  loss: 208.2608  loss_cls: 84.9480  loss_bbox: 56.7503  loss_dfl: 66.5625
04/24 01:46:29 - mmengine - INFO - Epoch(train)   [1][200/290]  base_lr: 2.0000e-03 lr: 3.9800e-04  eta: 2:00:17  time: 0.1783  data_time: 0.0173  memory: 4880  grad_norm: 0.0000  loss: 208.5585  loss_cls: 84.8515  loss_bbox: 57.1445  loss_dfl: 66.5625
04/24 01:46:37 - mmengine - INFO - Epoch(train)   [1][250/290]  base_lr: 2.0000e-03 lr: 4.9800e-04  eta: 1:52:12  time: 0.1685  data_time: 0.0032  memory: 4640  grad_norm: 0.0000  loss: 208.4256  loss_cls: 84.9207  loss_bbox: 56.9424  loss_dfl: 66.5625
04/24 01:46:45 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:46:57 - mmengine - INFO - Epoch(train)   [2][ 50/290]  base_lr: 2.0000e-03 lr: 6.7129e-04  eta: 1:48:58  time: 0.2320  data_time: 0.0606  memory: 4920  grad_norm: 0.0000  loss: 208.7070  loss_cls: 85.1418  loss_bbox: 57.0027  loss_dfl: 66.5625
04/24 01:47:05 - mmengine - INFO - Epoch(train)   [2][100/290]  base_lr: 2.0000e-03 lr: 7.7030e-04  eta: 1:45:39  time: 0.1772  data_time: 0.0228  memory: 4640  grad_norm: 0.0000  loss: 209.1946  loss_cls: 85.3240  loss_bbox: 57.3081  loss_dfl: 66.5625
04/24 01:47:14 - mmengine - INFO - Epoch(train)   [2][150/290]  base_lr: 2.0000e-03 lr: 8.6931e-04  eta: 1:43:13  time: 0.1798  data_time: 0.0082  memory: 4706  grad_norm: 0.0000  loss: 208.6277  loss_cls: 85.0666  loss_bbox: 56.9986  loss_dfl: 66.5625
04/24 01:47:23 - mmengine - INFO - Epoch(train)   [2][200/290]  base_lr: 2.0000e-03 lr: 9.6832e-04  eta: 1:40:57  time: 0.1738  data_time: 0.0115  memory: 4560  grad_norm: 0.0000  loss: 209.1859  loss_cls: 85.0124  loss_bbox: 57.6110  loss_dfl: 66.5625
04/24 01:47:32 - mmengine - INFO - Epoch(train)   [2][250/290]  base_lr: 2.0000e-03 lr: 1.0673e-03  eta: 1:39:16  time: 0.1781  data_time: 0.0104  memory: 4520  grad_norm: 0.0000  loss: 208.7904  loss_cls: 85.1610  loss_bbox: 57.0668  loss_dfl: 66.5625
04/24 01:47:39 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:47:51 - mmengine - INFO - Epoch(train)   [3][ 50/290]  base_lr: 2.0000e-03 lr: 1.2331e-03  eta: 1:39:06  time: 0.2327  data_time: 0.0540  memory: 4760  grad_norm: 0.0000  loss: 209.1261  loss_cls: 85.0435  loss_bbox: 57.5202  loss_dfl: 66.5625
04/24 01:48:00 - mmengine - INFO - Epoch(train)   [3][100/290]  base_lr: 2.0000e-03 lr: 1.3311e-03  eta: 1:37:54  time: 0.1798  data_time: 0.0118  memory: 4560  grad_norm: 0.0000  loss: 208.6577  loss_cls: 84.9895  loss_bbox: 57.1057  loss_dfl: 66.5625
04/24 01:48:09 - mmengine - INFO - Epoch(train)   [3][150/290]  base_lr: 2.0000e-03 lr: 1.4291e-03  eta: 1:36:45  time: 0.1771  data_time: 0.0109  memory: 4813  grad_norm: 0.0000  loss: 208.6249  loss_cls: 84.9350  loss_bbox: 57.1273  loss_dfl: 66.5625
04/24 01:48:18 - mmengine - INFO - Epoch(train)   [3][200/290]  base_lr: 2.0000e-03 lr: 1.5272e-03  eta: 1:35:39  time: 0.1746  data_time: 0.0081  memory: 4507  grad_norm: 0.0000  loss: 208.9671  loss_cls: 85.0744  loss_bbox: 57.3302  loss_dfl: 66.5625
04/24 01:48:26 - mmengine - INFO - Epoch(train)   [3][250/290]  base_lr: 2.0000e-03 lr: 1.6252e-03  eta: 1:34:13  time: 0.1587  data_time: 0.0127  memory: 4666  grad_norm: 0.0000  loss: 208.3120  loss_cls: 84.8422  loss_bbox: 56.9073  loss_dfl: 66.5625
04/24 01:48:34 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:48:45 - mmengine - INFO - Epoch(train)   [4][ 50/290]  base_lr: 2.0000e-03 lr: 1.7834e-03  eta: 1:34:26  time: 0.2219  data_time: 0.0478  memory: 4733  grad_norm: 0.0000  loss: 208.5537  loss_cls: 84.9261  loss_bbox: 57.0651  loss_dfl: 66.5625
04/24 01:48:54 - mmengine - INFO - Epoch(train)   [4][100/290]  base_lr: 2.0000e-03 lr: 1.8804e-03  eta: 1:33:37  time: 0.1745  data_time: 0.0155  memory: 4666  grad_norm: 0.0000  loss: 208.6375  loss_cls: 85.1082  loss_bbox: 56.9668  loss_dfl: 66.5625
04/24 01:48:59 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:49:02 - mmengine - INFO - Epoch(train)   [4][150/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:32:50  time: 0.1735  data_time: 0.0153  memory: 4813  grad_norm: 0.0000  loss: 209.0560  loss_cls: 84.9764  loss_bbox: 57.5171  loss_dfl: 66.5625
04/24 01:49:11 - mmengine - INFO - Epoch(train)   [4][200/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:32:11  time: 0.1771  data_time: 0.0158  memory: 4493  grad_norm: 0.0000  loss: 208.8431  loss_cls: 84.9746  loss_bbox: 57.3060  loss_dfl: 66.5625
04/24 01:49:20 - mmengine - INFO - Epoch(train)   [4][250/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:31:45  time: 0.1847  data_time: 0.0158  memory: 4507  grad_norm: 0.0000  loss: 208.0752  loss_cls: 84.9316  loss_bbox: 56.5811  loss_dfl: 66.5625
04/24 01:49:27 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:49:38 - mmengine - INFO - Epoch(train)   [5][ 50/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:31:28  time: 0.2187  data_time: 0.0511  memory: 4466  grad_norm: 0.0000  loss: 208.7696  loss_cls: 85.0308  loss_bbox: 57.1764  loss_dfl: 66.5625
04/24 01:49:47 - mmengine - INFO - Epoch(train)   [5][100/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:30:50  time: 0.1719  data_time: 0.0072  memory: 4613  grad_norm: 0.0000  loss: 208.6188  loss_cls: 84.9198  loss_bbox: 57.1365  loss_dfl: 66.5625
04/24 01:49:57 - mmengine - INFO - Epoch(train)   [5][150/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:30:52  time: 0.2073  data_time: 0.0197  memory: 4840  grad_norm: 0.0000  loss: 209.2043  loss_cls: 85.1290  loss_bbox: 57.5128  loss_dfl: 66.5625
04/24 01:50:06 - mmengine - INFO - Epoch(train)   [5][200/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:30:23  time: 0.1782  data_time: 0.0112  memory: 4746  grad_norm: 0.0000  loss: 208.6433  loss_cls: 84.9482  loss_bbox: 57.1325  loss_dfl: 66.5625
04/24 01:50:15 - mmengine - INFO - Epoch(train)   [5][250/290]  base_lr: 2.0000e-03 lr: 1.9406e-03  eta: 1:29:50  time: 0.1720  data_time: 0.0104  memory: 4480  grad_norm: 0.0000  loss: 209.0454  loss_cls: 85.0755  loss_bbox: 57.4073  loss_dfl: 66.5625
04/24 01:50:22 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:50:22 - mmengine - INFO - Saving checkpoint at 5 epochs
04/24 01:50:24 - mmengine - WARNING - `save_param_scheduler` is True but `self.param_schedulers` is None, so skip saving parameter schedulers
04/24 01:50:29 - mmengine - INFO - Epoch(val)   [5][  50/1160]    eta: 0:01:21  time: 0.0738  data_time: 0.0058  memory: 4533  
04/24 01:50:30 - mmengine - INFO - Epoch(val)   [5][ 100/1160]    eta: 0:00:56  time: 0.0323  data_time: 0.0016  memory: 596  
04/24 01:50:32 - mmengine - INFO - Epoch(val)   [5][ 150/1160]    eta: 0:00:45  time: 0.0303  data_time: 0.0006  memory: 596  
04/24 01:50:33 - mmengine - INFO - Epoch(val)   [5][ 200/1160]    eta: 0:00:39  time: 0.0281  data_time: 0.0005  memory: 596  
04/24 01:50:35 - mmengine - INFO - Epoch(val)   [5][ 250/1160]    eta: 0:00:34  time: 0.0263  data_time: 0.0012  memory: 596  
04/24 01:50:36 - mmengine - INFO - Epoch(val)   [5][ 300/1160]    eta: 0:00:32  time: 0.0369  data_time: 0.0007  memory: 596  
04/24 01:50:38 - mmengine - INFO - Epoch(val)   [5][ 350/1160]    eta: 0:00:29  time: 0.0250  data_time: 0.0005  memory: 596  
04/24 01:50:39 - mmengine - INFO - Epoch(val)   [5][ 400/1160]    eta: 0:00:26  time: 0.0295  data_time: 0.0009  memory: 596  
04/24 01:50:41 - mmengine - INFO - Epoch(val)   [5][ 450/1160]    eta: 0:00:24  time: 0.0288  data_time: 0.0006  memory: 596  
04/24 01:50:42 - mmengine - INFO - Epoch(val)   [5][ 500/1160]    eta: 0:00:22  time: 0.0240  data_time: 0.0009  memory: 596  
04/24 01:50:44 - mmengine - INFO - Epoch(val)   [5][ 550/1160]    eta: 0:00:20  time: 0.0361  data_time: 0.0011  memory: 596  
04/24 01:50:45 - mmengine - INFO - Epoch(val)   [5][ 600/1160]    eta: 0:00:18  time: 0.0281  data_time: 0.0009  memory: 596  
04/24 01:50:47 - mmengine - INFO - Epoch(val)   [5][ 650/1160]    eta: 0:00:16  time: 0.0318  data_time: 0.0012  memory: 596  
04/24 01:50:49 - mmengine - INFO - Epoch(val)   [5][ 700/1160]    eta: 0:00:15  time: 0.0380  data_time: 0.0012  memory: 596  
04/24 01:50:50 - mmengine - INFO - Epoch(val)   [5][ 750/1160]    eta: 0:00:13  time: 0.0320  data_time: 0.0007  memory: 596  
04/24 01:50:52 - mmengine - INFO - Epoch(val)   [5][ 800/1160]    eta: 0:00:11  time: 0.0265  data_time: 0.0008  memory: 596  
04/24 01:50:53 - mmengine - INFO - Epoch(val)   [5][ 850/1160]    eta: 0:00:10  time: 0.0225  data_time: 0.0012  memory: 596  
04/24 01:50:54 - mmengine - INFO - Epoch(val)   [5][ 900/1160]    eta: 0:00:08  time: 0.0330  data_time: 0.0015  memory: 596  
04/24 01:50:56 - mmengine - INFO - Epoch(val)   [5][ 950/1160]    eta: 0:00:06  time: 0.0290  data_time: 0.0009  memory: 596  
04/24 01:50:58 - mmengine - INFO - Epoch(val)   [5][1000/1160]    eta: 0:00:05  time: 0.0367  data_time: 0.0016  memory: 596  
04/24 01:50:59 - mmengine - INFO - Epoch(val)   [5][1050/1160]    eta: 0:00:03  time: 0.0285  data_time: 0.0007  memory: 596  
04/24 01:51:00 - mmengine - INFO - Epoch(val)   [5][1100/1160]    eta: 0:00:01  time: 0.0283  data_time: 0.0008  memory: 596  
04/24 01:51:02 - mmengine - INFO - Epoch(val)   [5][1150/1160]    eta: 0:00:00  time: 0.0357  data_time: 0.0015  memory: 596  
04/24 01:51:02 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 01:51:02 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.
04/24 01:51:02 - mmengine - INFO - Epoch(val) [5][1160/1160]    data_time: 0.0012  time: 0.0321
04/24 01:51:02 - mmengine - WARNING - Since `metrics` is an empty dict, the behavior to save the best checkpoint will be skipped in this evaluation.
04/24 01:51:15 - mmengine - INFO - Epoch(train)   [6][ 50/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:30:04  time: 0.2459  data_time: 0.0610  memory: 4564  grad_norm: 0.0000  loss: 208.6450  loss_cls: 85.1488  loss_bbox: 56.9337  loss_dfl: 66.5625
04/24 01:51:24 - mmengine - INFO - Epoch(train)   [6][100/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:29:36  time: 0.1752  data_time: 0.0077  memory: 4577  grad_norm: 0.0000  loss: 208.5121  loss_cls: 84.7862  loss_bbox: 57.1634  loss_dfl: 66.5625
04/24 01:51:34 - mmengine - INFO - Epoch(train)   [6][150/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:29:31  time: 0.2024  data_time: 0.0244  memory: 4430  grad_norm: 0.0000  loss: 209.0030  loss_cls: 85.0959  loss_bbox: 57.3446  loss_dfl: 66.5625
04/24 01:51:42 - mmengine - INFO - Epoch(train)   [6][200/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:29:05  time: 0.1756  data_time: 0.0224  memory: 4671  grad_norm: 0.0000  loss: 208.4982  loss_cls: 84.8802  loss_bbox: 57.0555  loss_dfl: 66.5625
04/24 01:51:51 - mmengine - INFO - Epoch(train)   [6][250/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:28:40  time: 0.1770  data_time: 0.0078  memory: 4416  grad_norm: 0.0000  loss: 208.7319  loss_cls: 85.0956  loss_bbox: 57.0738  loss_dfl: 66.5625
04/24 01:51:59 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:52:11 - mmengine - INFO - Epoch(train)   [7][ 50/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:28:51  time: 0.2433  data_time: 0.0595  memory: 4643  grad_norm: 0.0000  loss: 209.1036  loss_cls: 85.1163  loss_bbox: 57.4247  loss_dfl: 66.5625
04/24 01:52:19 - mmengine - INFO - Epoch(train)   [7][100/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:28:24  time: 0.1728  data_time: 0.0014  memory: 4443  grad_norm: 0.0000  loss: 208.8363  loss_cls: 84.8555  loss_bbox: 57.4183  loss_dfl: 66.5625
04/24 01:52:28 - mmengine - INFO - Epoch(train)   [7][150/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:28:01  time: 0.1766  data_time: 0.0150  memory: 4643  grad_norm: 0.0000  loss: 209.2883  loss_cls: 85.2692  loss_bbox: 57.4565  loss_dfl: 66.5625
04/24 01:52:37 - mmengine - INFO - Epoch(train)   [7][200/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:27:38  time: 0.1756  data_time: 0.0093  memory: 4590  grad_norm: 0.0000  loss: 209.3806  loss_cls: 85.1305  loss_bbox: 57.6876  loss_dfl: 66.5625
04/24 01:52:46 - mmengine - INFO - Epoch(train)   [7][250/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:27:23  time: 0.1872  data_time: 0.0144  memory: 4656  grad_norm: 0.0000  loss: 209.1152  loss_cls: 84.9242  loss_bbox: 57.6285  loss_dfl: 66.5625
04/24 01:52:49 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:52:54 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:53:05 - mmengine - INFO - Epoch(train)   [8][ 50/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:27:13  time: 0.2192  data_time: 0.0511  memory: 4630  grad_norm: 0.0000  loss: 209.1739  loss_cls: 85.0660  loss_bbox: 57.5453  loss_dfl: 66.5625
04/24 01:53:15 - mmengine - INFO - Epoch(train)   [8][100/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:27:07  time: 0.2008  data_time: 0.0103  memory: 4443  grad_norm: 0.0000  loss: 208.7517  loss_cls: 85.0137  loss_bbox: 57.1755  loss_dfl: 66.5625
04/24 01:53:23 - mmengine - INFO - Epoch(train)   [8][150/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:26:44  time: 0.1729  data_time: 0.0097  memory: 4523  grad_norm: 0.0000  loss: 208.3930  loss_cls: 85.0247  loss_bbox: 56.8058  loss_dfl: 66.5625
04/24 01:53:32 - mmengine - INFO - Epoch(train)   [8][200/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:26:22  time: 0.1729  data_time: 0.0050  memory: 4910  grad_norm: 0.0000  loss: 208.6075  loss_cls: 85.0593  loss_bbox: 56.9856  loss_dfl: 66.5625
04/24 01:53:41 - mmengine - INFO - Epoch(train)   [8][250/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:26:00  time: 0.1724  data_time: 0.0069  memory: 4603  grad_norm: 0.0000  loss: 208.7036  loss_cls: 85.0302  loss_bbox: 57.1108  loss_dfl: 66.5625
04/24 01:53:48 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:54:00 - mmengine - INFO - Epoch(train)   [9][ 50/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:26:08  time: 0.2425  data_time: 0.0541  memory: 4870  grad_norm: 0.0000  loss: 208.8394  loss_cls: 85.1671  loss_bbox: 57.1098  loss_dfl: 66.5625
04/24 01:54:09 - mmengine - INFO - Epoch(train)   [9][100/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:47  time: 0.1731  data_time: 0.0069  memory: 4577  grad_norm: 0.0000  loss: 208.5417  loss_cls: 84.9505  loss_bbox: 57.0286  loss_dfl: 66.5625
04/24 01:54:18 - mmengine - INFO - Epoch(train)   [9][150/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:27  time: 0.1738  data_time: 0.0166  memory: 4510  grad_norm: 0.0000  loss: 209.0994  loss_cls: 85.0374  loss_bbox: 57.4995  loss_dfl: 66.5625
04/24 01:54:27 - mmengine - INFO - Epoch(train)   [9][200/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:08  time: 0.1765  data_time: 0.0088  memory: 4550  grad_norm: 0.0000  loss: 208.5768  loss_cls: 85.0928  loss_bbox: 56.9215  loss_dfl: 66.5625
04/24 01:54:36 - mmengine - INFO - Epoch(train)   [9][250/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:01  time: 0.1972  data_time: 0.0190  memory: 4403  grad_norm: 0.0000  loss: 208.5594  loss_cls: 85.0345  loss_bbox: 56.9624  loss_dfl: 66.5625
04/24 01:54:44 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:54:56 - mmengine - INFO - Epoch(train)  [10][ 50/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:25:07  time: 0.2423  data_time: 0.0514  memory: 4991  grad_norm: 0.0000  loss: 208.5929  loss_cls: 85.0034  loss_bbox: 57.0269  loss_dfl: 66.5625
04/24 01:55:06 - mmengine - INFO - Epoch(train)  [10][100/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:24:53  time: 0.1849  data_time: 0.0157  memory: 4443  grad_norm: 0.0000  loss: 209.1813  loss_cls: 85.0113  loss_bbox: 57.6075  loss_dfl: 66.5625
04/24 01:55:15 - mmengine - INFO - Epoch(train)  [10][150/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:24:39  time: 0.1852  data_time: 0.0119  memory: 4630  grad_norm: 0.0000  loss: 209.0244  loss_cls: 85.0950  loss_bbox: 57.3670  loss_dfl: 66.5625
04/24 01:55:27 - mmengine - INFO - Epoch(train)  [10][200/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:24:55  time: 0.2496  data_time: 0.0425  memory: 4991  grad_norm: 0.0000  loss: 208.7363  loss_cls: 84.8396  loss_bbox: 57.3342  loss_dfl: 66.5625
04/24 01:55:39 - mmengine - INFO - Epoch(train)  [10][250/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:25:01  time: 0.2273  data_time: 0.0430  memory: 4603  grad_norm: 0.0000  loss: 208.3694  loss_cls: 84.9797  loss_bbox: 56.8273  loss_dfl: 66.5625
04/24 01:55:48 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:55:48 - mmengine - INFO - Saving checkpoint at 10 epochs
04/24 01:55:55 - mmengine - INFO - Epoch(val)  [10][  50/1160]    eta: 0:01:06  time: 0.0599  data_time: 0.0112  memory: 4656  
04/24 01:55:58 - mmengine - INFO - Epoch(val)  [10][ 100/1160]    eta: 0:01:02  time: 0.0582  data_time: 0.0052  memory: 597  
04/24 01:56:00 - mmengine - INFO - Epoch(val)  [10][ 150/1160]    eta: 0:00:54  time: 0.0449  data_time: 0.0015  memory: 597  
04/24 01:56:02 - mmengine - INFO - Epoch(val)  [10][ 200/1160]    eta: 0:00:51  time: 0.0503  data_time: 0.0019  memory: 597  
04/24 01:56:04 - mmengine - INFO - Epoch(val)  [10][ 250/1160]    eta: 0:00:45  time: 0.0367  data_time: 0.0011  memory: 597  
04/24 01:56:07 - mmengine - INFO - Epoch(val)  [10][ 300/1160]    eta: 0:00:43  time: 0.0503  data_time: 0.0016  memory: 597  
04/24 01:56:09 - mmengine - INFO - Epoch(val)  [10][ 350/1160]    eta: 0:00:38  time: 0.0362  data_time: 0.0013  memory: 597  
04/24 01:56:11 - mmengine - INFO - Epoch(val)  [10][ 400/1160]    eta: 0:00:36  time: 0.0439  data_time: 0.0015  memory: 597  
04/24 01:56:13 - mmengine - INFO - Epoch(val)  [10][ 450/1160]    eta: 0:00:33  time: 0.0496  data_time: 0.0020  memory: 597  
04/24 01:56:16 - mmengine - INFO - Epoch(val)  [10][ 500/1160]    eta: 0:00:31  time: 0.0478  data_time: 0.0027  memory: 597  
04/24 01:56:17 - mmengine - INFO - Epoch(val)  [10][ 550/1160]    eta: 0:00:28  time: 0.0350  data_time: 0.0013  memory: 597  
04/24 01:56:20 - mmengine - INFO - Epoch(val)  [10][ 600/1160]    eta: 0:00:25  time: 0.0424  data_time: 0.0014  memory: 597  
04/24 01:56:22 - mmengine - INFO - Epoch(val)  [10][ 650/1160]    eta: 0:00:23  time: 0.0520  data_time: 0.0028  memory: 597  
04/24 01:56:25 - mmengine - INFO - Epoch(val)  [10][ 700/1160]    eta: 0:00:21  time: 0.0473  data_time: 0.0032  memory: 597  
04/24 01:56:26 - mmengine - INFO - Epoch(val)  [10][ 750/1160]    eta: 0:00:18  time: 0.0333  data_time: 0.0008  memory: 597  
04/24 01:56:29 - mmengine - INFO - Epoch(val)  [10][ 800/1160]    eta: 0:00:16  time: 0.0478  data_time: 0.0029  memory: 597  
04/24 01:56:31 - mmengine - INFO - Epoch(val)  [10][ 850/1160]    eta: 0:00:14  time: 0.0504  data_time: 0.0018  memory: 597  
04/24 01:56:33 - mmengine - INFO - Epoch(val)  [10][ 900/1160]    eta: 0:00:11  time: 0.0408  data_time: 0.0008  memory: 597  
04/24 01:56:35 - mmengine - INFO - Epoch(val)  [10][ 950/1160]    eta: 0:00:09  time: 0.0361  data_time: 0.0014  memory: 597  
04/24 01:56:37 - mmengine - INFO - Epoch(val)  [10][1000/1160]    eta: 0:00:07  time: 0.0407  data_time: 0.0009  memory: 597  
04/24 01:56:40 - mmengine - INFO - Epoch(val)  [10][1050/1160]    eta: 0:00:05  time: 0.0546  data_time: 0.0041  memory: 597  
04/24 01:56:42 - mmengine - INFO - Epoch(val)  [10][1100/1160]    eta: 0:00:02  time: 0.0513  data_time: 0.0018  memory: 597  
04/24 01:56:44 - mmengine - INFO - Epoch(val)  [10][1150/1160]    eta: 0:00:00  time: 0.0306  data_time: 0.0010  memory: 597  
04/24 01:56:44 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 01:56:44 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.
04/24 01:56:44 - mmengine - INFO - Epoch(val) [10][1160/1160]    data_time: 0.0023  time: 0.0450
lin-whale commented 5 months ago

I find that adding "load_from" to the config file fixes the problem above, but I'm not sure which pretrained model file to use. Should it be load_from = 'pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth' or load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth'?
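
One way to sanity-check which checkpoint matches the model is to load each file and inspect its state_dict keys (a plain-PyTorch sketch with the paths above; only an illustration):

import torch

# mmengine checkpoints store the weights under 'state_dict'.
ckpt = torch.load(
    'pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth',
    map_location='cpu')
keys = list(ckpt.get('state_dict', ckpt).keys())
print(len(keys))
# Does this checkpoint carry a text branch? The prompt-tuning config above
# builds the backbone with with_text_model=False, so unmatched
# backbone.text_model.* keys would simply be skipped on load.
print([k for k in keys if k.startswith('backbone.text_model')][:5])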

zhujiajian98 commented 5 months ago

@Hstwhale Hello, can you use the weights and the corresponding prompts you have trained to predict the specified targets?

lin-whale commented 5 months ago

@zhujiajian98 Using the model trained from 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth', I can predict the custom targets, but the performance is much worse than direct fine-tuning.

zhujiajian98 commented 5 months ago

@lin-whale Hello, what I'm wondering is whether your trained model makes normal predictions, or just random ones. What I suspect is that there may be a problem in the evaluation function, or something else that prevents the evaluation from completing.

lin-whale commented 5 months ago

@zhujiajian98 After setting load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth', I have solved the problem of the loss not decreasing during training, and the predictions are informative now. It is just not as good as direct fine-tuning: prompt tuning reaches about 70% AP, while direct fine-tuning reaches about 90% AP.
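
For anyone hitting the same symptom: the fix amounts to one line in the config above. Since almost everything in this config is frozen (frozen_stages=4 on the backbone, freeze_all=True on both neck and head) and only the prompt embeddings train, starting without pretrained weights leaves the frozen detector at its initialization, which is consistent with the flat loss and grad_norm: 0.0000 in the log:

# Added near the top of the prompt-tuning config: initialize the frozen
# detector from pretrained YOLO-World weights (path as used in this thread).
load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth'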

zhujiajian98 commented 5 months ago

@lin-whale Do you mean that the YOLO-World model trained by prompt tuning will score lower?

lin-whale commented 5 months ago

> @lin-whale Do you mean that the YOLO-World model trained by prompt tuning will score lower?

Yes

chenjiafu-George commented 5 months ago

The training output:

]
use_mask2refine = True
val_ann_file = 'test/2024-03-28-ann-cvat/annotations/annotations.json'
val_batch_size_per_gpu = 1
val_cfg = dict(type='ValLoop')
val_data_prefix = 'test/2024-03-28-ann-cvat/images/'
val_dataloader = dict(
    batch_size=1,
    dataset=dict(
        ann_file=
        '/data/cvat/train/2024-04-02-ann-cvat/annotations/annotations.json.test',
        batch_shapes_cfg=None,
        data_prefix=dict(img='images/'),
        data_root='/data/cvat/train/2024-04-02-ann-cvat',
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        metainfo=dict(
            classes=(
                'floor',
                'person',
                'paper',
                'bottle',
                'paper cup',
                'mask',
                'thread',
                'waiting bench',
                'sturdy',
                'plastic bag',
                'table',
                'packaging bag',
                'door',
                'carton box',
                'sticker',
                'screen',
                'book',
                'cotton ball',
                'warning sign',
                'rod',
                'poster rack',
                'vomit',
                'blood',
                'traffic cone',
                'trash can',
                'cart',
                'rack',
                'bag',
                'flowerpot',
                'medication',
                'paper box',
                'meal box',
                'pericarp',
                'hat',
                'umbrella',
                'drip stand',
                'coffee stains',
                'elevator entrance',
                'escalator entrance',
                'triage desk',
                'registration machine',
                'fire hydrant',
                'hospital bed',
                'milk stains',
                'plinth',
                'chair',
                'wheel chair',
                'swab',
                'drinking cup',
                'fallen leaves',
            )),
        pipeline=[
            dict(backend_args=None, type='LoadImageFromFile'),
            dict(scale=(
                640,
                640,
            ), type='YOLOv5KeepRatioResize'),
            dict(
                allow_scale_up=False,
                pad_val=dict(img=114),
                scale=(
                    640,
                    640,
                ),
                type='LetterResize'),
            dict(_scope_='mmdet', type='LoadAnnotations', with_bbox=True),
            dict(
                meta_keys=(
                    'img_id',
                    'img_path',
                    'ori_shape',
                    'img_shape',
                    'scale_factor',
                    'pad_param',
                ),
                type='mmdet.PackDetInputs'),
        ],
        test_mode=True,
        type='YOLOv5CocoDataset'),
    drop_last=False,
    num_workers=2,
    persistent_workers=True,
    pin_memory=True,
    sampler=dict(shuffle=False, type='DefaultSampler'))
val_evaluator = dict(
    ann_file=
    '/data/cvat/train/2024-04-02-ann-cvat/annotations/annotations.json.test',
    classwise=True,
    metric='bbox',
    proposal_nums=(
        100,
        1,
        10,
    ),
    type='mmdet.CocoMetric')
val_interval_stage2 = 1
val_num_workers = 2
vis_backends = [
    dict(type='LocalVisBackend'),
]
visualizer = dict(
    name='visualizer',
    type='mmdet.DetLocalVisualizer',
    vis_backends=[
        dict(type='LocalVisBackend'),
    ])
weight_decay = 0.05
widen_factor = 1.0
work_dir = './work_dirs/yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco'

04/24 01:45:30 - mmengine - INFO - Distributed training is not used, all SyncBatchNorm (SyncBN) layers in the model will be automatically reverted to BatchNormXd layers if they are used.
04/24 01:45:30 - mmengine - INFO - Hooks will be executed in the following order:
before_run:
(VERY_HIGH   ) RuntimeInfoHook                    
(49          ) EMAHook                            
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
after_load_checkpoint:
(49          ) EMAHook                            
 -------------------- 
before_train:
(9           ) YOLOv5ParamSchedulerHook           
(VERY_HIGH   ) RuntimeInfoHook                    
(49          ) EMAHook                            
(NORMAL      ) IterTimerHook                      
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
before_train_epoch:
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
(NORMAL      ) DistSamplerSeedHook                
(NORMAL      ) PipelineSwitchHook                 
 -------------------- 
before_train_iter:
(9           ) YOLOv5ParamSchedulerHook           
(VERY_HIGH   ) RuntimeInfoHook                    
(NORMAL      ) IterTimerHook                      
 -------------------- 
after_train_iter:
(9           ) YOLOv5ParamSchedulerHook           
(VERY_HIGH   ) RuntimeInfoHook                    
(49          ) EMAHook                            
(NORMAL      ) IterTimerHook                      
(BELOW_NORMAL) LoggerHook                         
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
after_train_epoch:
(9           ) YOLOv5ParamSchedulerHook           
(NORMAL      ) IterTimerHook                      
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
before_val:
(VERY_HIGH   ) RuntimeInfoHook                    
 -------------------- 
before_val_epoch:
(49          ) EMAHook                            
(NORMAL      ) IterTimerHook                      
 -------------------- 
before_val_iter:
(NORMAL      ) IterTimerHook                      
 -------------------- 
after_val_iter:
(NORMAL      ) IterTimerHook                      
(NORMAL      ) DetVisualizationHook               
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
after_val_epoch:
(9           ) YOLOv5ParamSchedulerHook           
(VERY_HIGH   ) RuntimeInfoHook                    
(49          ) EMAHook                            
(NORMAL      ) IterTimerHook                      
(BELOW_NORMAL) LoggerHook                         
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
after_val:
(VERY_HIGH   ) RuntimeInfoHook                    
 -------------------- 
before_save_checkpoint:
(49          ) EMAHook                            
 -------------------- 
after_train:
(VERY_HIGH   ) RuntimeInfoHook                    
(VERY_LOW    ) CheckpointHook                     
 -------------------- 
before_test:
(VERY_HIGH   ) RuntimeInfoHook                    
 -------------------- 
before_test_epoch:
(49          ) EMAHook                            
(NORMAL      ) IterTimerHook                      
 -------------------- 
before_test_iter:
(NORMAL      ) IterTimerHook                      
 -------------------- 
after_test_iter:
(NORMAL      ) IterTimerHook                      
(NORMAL      ) DetVisualizationHook               
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
after_test_epoch:
(VERY_HIGH   ) RuntimeInfoHook                    
(49          ) EMAHook                            
(NORMAL      ) IterTimerHook                      
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
after_test:
(VERY_HIGH   ) RuntimeInfoHook                    
 -------------------- 
after_run:
(BELOW_NORMAL) LoggerHook                         
 -------------------- 
loading annotations into memory...
Done (t=1.32s)
creating index...
index created!
04/24 01:45:34 - mmengine - INFO - paramwise_options -- embeddings:lr=0.002
04/24 01:45:34 - mmengine - INFO - paramwise_options -- embeddings:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.main_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.main_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.final_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.final_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.0.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.0.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.0.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.0.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.1.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.1.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.1.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.1.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.2.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.2.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.2.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.blocks.2.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.attn_block.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.attn_block.guide_fc.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.attn_block.project_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.0.attn_block.project_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.main_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.main_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.final_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.final_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.0.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.0.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.0.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.0.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.1.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.1.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.1.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.1.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.2.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.2.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.2.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.blocks.2.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.attn_block.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.attn_block.guide_fc.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.attn_block.project_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.top_down_layers.1.attn_block.project_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.downsample_layers.0.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.downsample_layers.0.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.downsample_layers.1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.downsample_layers.1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.main_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.main_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.final_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.final_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.0.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.0.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.0.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.0.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.1.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.1.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.1.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.1.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.2.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.2.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.2.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.blocks.2.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.attn_block.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.attn_block.guide_fc.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.attn_block.project_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.0.attn_block.project_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.main_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.main_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.final_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.final_conv.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.0.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.0.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.0.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.0.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.1.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.1.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.1.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.1.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.2.conv1.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.2.conv1.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.2.conv2.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.blocks.2.conv2.bn.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.attn_block.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.attn_block.guide_fc.bias:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.attn_block.project_conv.bn.weight:weight_decay=0.0
04/24 01:45:34 - mmengine - INFO - paramwise_options -- neck.bottom_up_layers.1.attn_block.project_conv.bn.bias:weight_decay=0.0
loading annotations into memory...
Done (t=0.25s)
creating index...
index created!
loading annotations into memory...
Done (t=0.20s)
creating index...
index created!
04/24 01:50:38 - mmengine - INFO - Epoch(val)   [5][ 350/1160]    eta: 0:00:29  time: 0.0250  data_time: 0.0005  memory: 596  
04/24 01:50:39 - mmengine - INFO - Epoch(val)   [5][ 400/1160]    eta: 0:00:26  time: 0.0295  data_time: 0.0009  memory: 596  
04/24 01:50:41 - mmengine - INFO - Epoch(val)   [5][ 450/1160]    eta: 0:00:24  time: 0.0288  data_time: 0.0006  memory: 596  
04/24 01:50:42 - mmengine - INFO - Epoch(val)   [5][ 500/1160]    eta: 0:00:22  time: 0.0240  data_time: 0.0009  memory: 596  
04/24 01:50:44 - mmengine - INFO - Epoch(val)   [5][ 550/1160]    eta: 0:00:20  time: 0.0361  data_time: 0.0011  memory: 596  
04/24 01:50:45 - mmengine - INFO - Epoch(val)   [5][ 600/1160]    eta: 0:00:18  time: 0.0281  data_time: 0.0009  memory: 596  
04/24 01:50:47 - mmengine - INFO - Epoch(val)   [5][ 650/1160]    eta: 0:00:16  time: 0.0318  data_time: 0.0012  memory: 596  
04/24 01:50:49 - mmengine - INFO - Epoch(val)   [5][ 700/1160]    eta: 0:00:15  time: 0.0380  data_time: 0.0012  memory: 596  
04/24 01:50:50 - mmengine - INFO - Epoch(val)   [5][ 750/1160]    eta: 0:00:13  time: 0.0320  data_time: 0.0007  memory: 596  
04/24 01:50:52 - mmengine - INFO - Epoch(val)   [5][ 800/1160]    eta: 0:00:11  time: 0.0265  data_time: 0.0008  memory: 596  
04/24 01:50:53 - mmengine - INFO - Epoch(val)   [5][ 850/1160]    eta: 0:00:10  time: 0.0225  data_time: 0.0012  memory: 596  
04/24 01:50:54 - mmengine - INFO - Epoch(val)   [5][ 900/1160]    eta: 0:00:08  time: 0.0330  data_time: 0.0015  memory: 596  
04/24 01:50:56 - mmengine - INFO - Epoch(val)   [5][ 950/1160]    eta: 0:00:06  time: 0.0290  data_time: 0.0009  memory: 596  
04/24 01:50:58 - mmengine - INFO - Epoch(val)   [5][1000/1160]    eta: 0:00:05  time: 0.0367  data_time: 0.0016  memory: 596  
04/24 01:50:59 - mmengine - INFO - Epoch(val)   [5][1050/1160]    eta: 0:00:03  time: 0.0285  data_time: 0.0007  memory: 596  
04/24 01:51:00 - mmengine - INFO - Epoch(val)   [5][1100/1160]    eta: 0:00:01  time: 0.0283  data_time: 0.0008  memory: 596  
04/24 01:51:02 - mmengine - INFO - Epoch(val)   [5][1150/1160]    eta: 0:00:00  time: 0.0357  data_time: 0.0015  memory: 596  
04/24 01:51:02 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 01:51:02 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.
04/24 01:51:02 - mmengine - INFO - Epoch(val) [5][1160/1160]    data_time: 0.0012  time: 0.0321
04/24 01:51:02 - mmengine - WARNING - Since `metrics` is an empty dict, the behavior to save the best checkpoint will be skipped in this evaluation.
04/24 01:51:15 - mmengine - INFO - Epoch(train)   [6][ 50/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:30:04  time: 0.2459  data_time: 0.0610  memory: 4564  grad_norm: 0.0000  loss: 208.6450  loss_cls: 85.1488  loss_bbox: 56.9337  loss_dfl: 66.5625
04/24 01:51:24 - mmengine - INFO - Epoch(train)   [6][100/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:29:36  time: 0.1752  data_time: 0.0077  memory: 4577  grad_norm: 0.0000  loss: 208.5121  loss_cls: 84.7862  loss_bbox: 57.1634  loss_dfl: 66.5625
04/24 01:51:34 - mmengine - INFO - Epoch(train)   [6][150/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:29:31  time: 0.2024  data_time: 0.0244  memory: 4430  grad_norm: 0.0000  loss: 209.0030  loss_cls: 85.0959  loss_bbox: 57.3446  loss_dfl: 66.5625
04/24 01:51:42 - mmengine - INFO - Epoch(train)   [6][200/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:29:05  time: 0.1756  data_time: 0.0224  memory: 4671  grad_norm: 0.0000  loss: 208.4982  loss_cls: 84.8802  loss_bbox: 57.0555  loss_dfl: 66.5625
04/24 01:51:51 - mmengine - INFO - Epoch(train)   [6][250/290]  base_lr: 2.0000e-03 lr: 1.9208e-03  eta: 1:28:40  time: 0.1770  data_time: 0.0078  memory: 4416  grad_norm: 0.0000  loss: 208.7319  loss_cls: 85.0956  loss_bbox: 57.0738  loss_dfl: 66.5625
04/24 01:51:59 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:52:11 - mmengine - INFO - Epoch(train)   [7][ 50/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:28:51  time: 0.2433  data_time: 0.0595  memory: 4643  grad_norm: 0.0000  loss: 209.1036  loss_cls: 85.1163  loss_bbox: 57.4247  loss_dfl: 66.5625
04/24 01:52:19 - mmengine - INFO - Epoch(train)   [7][100/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:28:24  time: 0.1728  data_time: 0.0014  memory: 4443  grad_norm: 0.0000  loss: 208.8363  loss_cls: 84.8555  loss_bbox: 57.4183  loss_dfl: 66.5625
04/24 01:52:28 - mmengine - INFO - Epoch(train)   [7][150/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:28:01  time: 0.1766  data_time: 0.0150  memory: 4643  grad_norm: 0.0000  loss: 209.2883  loss_cls: 85.2692  loss_bbox: 57.4565  loss_dfl: 66.5625
04/24 01:52:37 - mmengine - INFO - Epoch(train)   [7][200/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:27:38  time: 0.1756  data_time: 0.0093  memory: 4590  grad_norm: 0.0000  loss: 209.3806  loss_cls: 85.1305  loss_bbox: 57.6876  loss_dfl: 66.5625
04/24 01:52:46 - mmengine - INFO - Epoch(train)   [7][250/290]  base_lr: 2.0000e-03 lr: 1.9010e-03  eta: 1:27:23  time: 0.1872  data_time: 0.0144  memory: 4656  grad_norm: 0.0000  loss: 209.1152  loss_cls: 84.9242  loss_bbox: 57.6285  loss_dfl: 66.5625
04/24 01:52:49 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:52:54 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:53:05 - mmengine - INFO - Epoch(train)   [8][ 50/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:27:13  time: 0.2192  data_time: 0.0511  memory: 4630  grad_norm: 0.0000  loss: 209.1739  loss_cls: 85.0660  loss_bbox: 57.5453  loss_dfl: 66.5625
04/24 01:53:15 - mmengine - INFO - Epoch(train)   [8][100/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:27:07  time: 0.2008  data_time: 0.0103  memory: 4443  grad_norm: 0.0000  loss: 208.7517  loss_cls: 85.0137  loss_bbox: 57.1755  loss_dfl: 66.5625
04/24 01:53:23 - mmengine - INFO - Epoch(train)   [8][150/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:26:44  time: 0.1729  data_time: 0.0097  memory: 4523  grad_norm: 0.0000  loss: 208.3930  loss_cls: 85.0247  loss_bbox: 56.8058  loss_dfl: 66.5625
04/24 01:53:32 - mmengine - INFO - Epoch(train)   [8][200/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:26:22  time: 0.1729  data_time: 0.0050  memory: 4910  grad_norm: 0.0000  loss: 208.6075  loss_cls: 85.0593  loss_bbox: 56.9856  loss_dfl: 66.5625
04/24 01:53:41 - mmengine - INFO - Epoch(train)   [8][250/290]  base_lr: 2.0000e-03 lr: 1.8812e-03  eta: 1:26:00  time: 0.1724  data_time: 0.0069  memory: 4603  grad_norm: 0.0000  loss: 208.7036  loss_cls: 85.0302  loss_bbox: 57.1108  loss_dfl: 66.5625
04/24 01:53:48 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:54:00 - mmengine - INFO - Epoch(train)   [9][ 50/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:26:08  time: 0.2425  data_time: 0.0541  memory: 4870  grad_norm: 0.0000  loss: 208.8394  loss_cls: 85.1671  loss_bbox: 57.1098  loss_dfl: 66.5625
04/24 01:54:09 - mmengine - INFO - Epoch(train)   [9][100/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:47  time: 0.1731  data_time: 0.0069  memory: 4577  grad_norm: 0.0000  loss: 208.5417  loss_cls: 84.9505  loss_bbox: 57.0286  loss_dfl: 66.5625
04/24 01:54:18 - mmengine - INFO - Epoch(train)   [9][150/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:27  time: 0.1738  data_time: 0.0166  memory: 4510  grad_norm: 0.0000  loss: 209.0994  loss_cls: 85.0374  loss_bbox: 57.4995  loss_dfl: 66.5625
04/24 01:54:27 - mmengine - INFO - Epoch(train)   [9][200/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:08  time: 0.1765  data_time: 0.0088  memory: 4550  grad_norm: 0.0000  loss: 208.5768  loss_cls: 85.0928  loss_bbox: 56.9215  loss_dfl: 66.5625
04/24 01:54:36 - mmengine - INFO - Epoch(train)   [9][250/290]  base_lr: 2.0000e-03 lr: 1.8614e-03  eta: 1:25:01  time: 0.1972  data_time: 0.0190  memory: 4403  grad_norm: 0.0000  loss: 208.5594  loss_cls: 85.0345  loss_bbox: 56.9624  loss_dfl: 66.5625
04/24 01:54:44 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:54:56 - mmengine - INFO - Epoch(train)  [10][ 50/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:25:07  time: 0.2423  data_time: 0.0514  memory: 4991  grad_norm: 0.0000  loss: 208.5929  loss_cls: 85.0034  loss_bbox: 57.0269  loss_dfl: 66.5625
04/24 01:55:06 - mmengine - INFO - Epoch(train)  [10][100/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:24:53  time: 0.1849  data_time: 0.0157  memory: 4443  grad_norm: 0.0000  loss: 209.1813  loss_cls: 85.0113  loss_bbox: 57.6075  loss_dfl: 66.5625
04/24 01:55:15 - mmengine - INFO - Epoch(train)  [10][150/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:24:39  time: 0.1852  data_time: 0.0119  memory: 4630  grad_norm: 0.0000  loss: 209.0244  loss_cls: 85.0950  loss_bbox: 57.3670  loss_dfl: 66.5625
04/24 01:55:27 - mmengine - INFO - Epoch(train)  [10][200/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:24:55  time: 0.2496  data_time: 0.0425  memory: 4991  grad_norm: 0.0000  loss: 208.7363  loss_cls: 84.8396  loss_bbox: 57.3342  loss_dfl: 66.5625
04/24 01:55:39 - mmengine - INFO - Epoch(train)  [10][250/290]  base_lr: 2.0000e-03 lr: 1.8416e-03  eta: 1:25:01  time: 0.2273  data_time: 0.0430  memory: 4603  grad_norm: 0.0000  loss: 208.3694  loss_cls: 84.9797  loss_bbox: 56.8273  loss_dfl: 66.5625
04/24 01:55:48 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:55:48 - mmengine - INFO - Saving checkpoint at 10 epochs
04/24 01:55:55 - mmengine - INFO - Epoch(val)  [10][  50/1160]    eta: 0:01:06  time: 0.0599  data_time: 0.0112  memory: 4656  
04/24 01:55:58 - mmengine - INFO - Epoch(val)  [10][ 100/1160]    eta: 0:01:02  time: 0.0582  data_time: 0.0052  memory: 597  
04/24 01:56:00 - mmengine - INFO - Epoch(val)  [10][ 150/1160]    eta: 0:00:54  time: 0.0449  data_time: 0.0015  memory: 597  
04/24 01:56:02 - mmengine - INFO - Epoch(val)  [10][ 200/1160]    eta: 0:00:51  time: 0.0503  data_time: 0.0019  memory: 597  
04/24 01:56:04 - mmengine - INFO - Epoch(val)  [10][ 250/1160]    eta: 0:00:45  time: 0.0367  data_time: 0.0011  memory: 597  
04/24 01:56:07 - mmengine - INFO - Epoch(val)  [10][ 300/1160]    eta: 0:00:43  time: 0.0503  data_time: 0.0016  memory: 597  
04/24 01:56:09 - mmengine - INFO - Epoch(val)  [10][ 350/1160]    eta: 0:00:38  time: 0.0362  data_time: 0.0013  memory: 597  
04/24 01:56:11 - mmengine - INFO - Epoch(val)  [10][ 400/1160]    eta: 0:00:36  time: 0.0439  data_time: 0.0015  memory: 597  
04/24 01:56:13 - mmengine - INFO - Epoch(val)  [10][ 450/1160]    eta: 0:00:33  time: 0.0496  data_time: 0.0020  memory: 597  
04/24 01:56:16 - mmengine - INFO - Epoch(val)  [10][ 500/1160]    eta: 0:00:31  time: 0.0478  data_time: 0.0027  memory: 597  
04/24 01:56:17 - mmengine - INFO - Epoch(val)  [10][ 550/1160]    eta: 0:00:28  time: 0.0350  data_time: 0.0013  memory: 597  
04/24 01:56:20 - mmengine - INFO - Epoch(val)  [10][ 600/1160]    eta: 0:00:25  time: 0.0424  data_time: 0.0014  memory: 597  
04/24 01:56:22 - mmengine - INFO - Epoch(val)  [10][ 650/1160]    eta: 0:00:23  time: 0.0520  data_time: 0.0028  memory: 597  
04/24 01:56:25 - mmengine - INFO - Epoch(val)  [10][ 700/1160]    eta: 0:00:21  time: 0.0473  data_time: 0.0032  memory: 597  
04/24 01:56:26 - mmengine - INFO - Epoch(val)  [10][ 750/1160]    eta: 0:00:18  time: 0.0333  data_time: 0.0008  memory: 597  
04/24 01:56:29 - mmengine - INFO - Epoch(val)  [10][ 800/1160]    eta: 0:00:16  time: 0.0478  data_time: 0.0029  memory: 597  
04/24 01:56:31 - mmengine - INFO - Epoch(val)  [10][ 850/1160]    eta: 0:00:14  time: 0.0504  data_time: 0.0018  memory: 597  
04/24 01:56:33 - mmengine - INFO - Epoch(val)  [10][ 900/1160]    eta: 0:00:11  time: 0.0408  data_time: 0.0008  memory: 597  
04/24 01:56:35 - mmengine - INFO - Epoch(val)  [10][ 950/1160]    eta: 0:00:09  time: 0.0361  data_time: 0.0014  memory: 597  
04/24 01:56:37 - mmengine - INFO - Epoch(val)  [10][1000/1160]    eta: 0:00:07  time: 0.0407  data_time: 0.0009  memory: 597  
04/24 01:56:40 - mmengine - INFO - Epoch(val)  [10][1050/1160]    eta: 0:00:05  time: 0.0546  data_time: 0.0041  memory: 597  
04/24 01:56:42 - mmengine - INFO - Epoch(val)  [10][1100/1160]    eta: 0:00:02  time: 0.0513  data_time: 0.0018  memory: 597  
04/24 01:56:44 - mmengine - INFO - Epoch(val)  [10][1150/1160]    eta: 0:00:00  time: 0.0306  data_time: 0.0010  memory: 597  
04/24 01:56:44 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 01:56:44 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.
04/24 01:56:44 - mmengine - INFO - Epoch(val) [10][1160/1160]    data_time: 0.0023  time: 0.0450
04/24 01:56:56 - mmengine - INFO - Epoch(train)  [11][ 50/290]  base_lr: 2.0000e-03 lr: 1.8218e-03  eta: 1:25:15  time: 0.2441  data_time: 0.0684  memory: 4510  grad_norm: 0.0000  loss: 208.4549  loss_cls: 85.0023  loss_bbox: 56.8901  loss_dfl: 66.5625
04/24 01:57:05 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:57:05 - mmengine - INFO - Epoch(train)  [11][100/290]  base_lr: 2.0000e-03 lr: 1.8218e-03  eta: 1:24:55  time: 0.1744  data_time: 0.0112  memory: 4843  grad_norm: 0.0000  loss: 208.3745  loss_cls: 85.1314  loss_bbox: 56.6806  loss_dfl: 66.5625
04/24 01:57:14 - mmengine - INFO - Epoch(train)  [11][150/290]  base_lr: 2.0000e-03 lr: 1.8218e-03  eta: 1:24:38  time: 0.1796  data_time: 0.0189  memory: 4616  grad_norm: 0.0000  loss: 209.1933  loss_cls: 85.0257  loss_bbox: 57.6051  loss_dfl: 66.5625
04/24 01:57:23 - mmengine - INFO - Epoch(train)  [11][200/290]  base_lr: 2.0000e-03 lr: 1.8218e-03  eta: 1:24:20  time: 0.1740  data_time: 0.0064  memory: 4683  grad_norm: 0.0000  loss: 209.2356  loss_cls: 85.1660  loss_bbox: 57.5071  loss_dfl: 66.5625
04/24 01:57:33 - mmengine - INFO - Epoch(train)  [11][250/290]  base_lr: 2.0000e-03 lr: 1.8218e-03  eta: 1:24:12  time: 0.2011  data_time: 0.0146  memory: 4483  grad_norm: 0.0000  loss: 208.3271  loss_cls: 84.9534  loss_bbox: 56.8112  loss_dfl: 66.5625
04/24 01:57:40 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:57:52 - mmengine - INFO - Epoch(train)  [12][ 50/290]  base_lr: 2.0000e-03 lr: 1.8020e-03  eta: 1:24:03  time: 0.2287  data_time: 0.0580  memory: 4456  grad_norm: 0.0000  loss: 209.1542  loss_cls: 85.0694  loss_bbox: 57.5223  loss_dfl: 66.5625
04/24 01:58:02 - mmengine - INFO - Epoch(train)  [12][100/290]  base_lr: 2.0000e-03 lr: 1.8020e-03  eta: 1:23:56  time: 0.2051  data_time: 0.0225  memory: 4430  grad_norm: 0.0000  loss: 208.7917  loss_cls: 84.9126  loss_bbox: 57.3165  loss_dfl: 66.5625
04/24 01:58:11 - mmengine - INFO - Epoch(train)  [12][150/290]  base_lr: 2.0000e-03 lr: 1.8020e-03  eta: 1:23:38  time: 0.1733  data_time: 0.0099  memory: 4683  grad_norm: 0.0000  loss: 208.8347  loss_cls: 84.9103  loss_bbox: 57.3619  loss_dfl: 66.5625
04/24 01:58:20 - mmengine - INFO - Epoch(train)  [12][200/290]  base_lr: 2.0000e-03 lr: 1.8020e-03  eta: 1:23:22  time: 0.1793  data_time: 0.0138  memory: 4656  grad_norm: 0.0000  loss: 209.0807  loss_cls: 85.1878  loss_bbox: 57.3304  loss_dfl: 66.5625
04/24 01:58:29 - mmengine - INFO - Epoch(train)  [12][250/290]  base_lr: 2.0000e-03 lr: 1.8020e-03  eta: 1:23:12  time: 0.1941  data_time: 0.0114  memory: 4963  grad_norm: 0.0000  loss: 208.7168  loss_cls: 85.1470  loss_bbox: 57.0073  loss_dfl: 66.5625
04/24 01:58:36 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:58:48 - mmengine - INFO - Epoch(train)  [13][ 50/290]  base_lr: 2.0000e-03 lr: 1.7822e-03  eta: 1:22:59  time: 0.2293  data_time: 0.0632  memory: 4816  grad_norm: 0.0000  loss: 208.5295  loss_cls: 84.9813  loss_bbox: 56.9857  loss_dfl: 66.5625
04/24 01:58:57 - mmengine - INFO - Epoch(train)  [13][100/290]  base_lr: 2.0000e-03 lr: 1.7822e-03  eta: 1:22:45  time: 0.1838  data_time: 0.0203  memory: 4656  grad_norm: 0.0000  loss: 209.1732  loss_cls: 85.0846  loss_bbox: 57.5261  loss_dfl: 66.5625
04/24 01:59:06 - mmengine - INFO - Epoch(train)  [13][150/290]  base_lr: 2.0000e-03 lr: 1.7822e-03  eta: 1:22:29  time: 0.1767  data_time: 0.0126  memory: 4883  grad_norm: 0.0000  loss: 208.6712  loss_cls: 84.8874  loss_bbox: 57.2213  loss_dfl: 66.5625
04/24 01:59:15 - mmengine - INFO - Epoch(train)  [13][200/290]  base_lr: 2.0000e-03 lr: 1.7822e-03  eta: 1:22:13  time: 0.1783  data_time: 0.0155  memory: 4671  grad_norm: 0.0000  loss: 208.8264  loss_cls: 85.0155  loss_bbox: 57.2485  loss_dfl: 66.5625
04/24 01:59:24 - mmengine - INFO - Epoch(train)  [13][250/290]  base_lr: 2.0000e-03 lr: 1.7822e-03  eta: 1:22:04  time: 0.1963  data_time: 0.0159  memory: 4523  grad_norm: 0.0000  loss: 208.7948  loss_cls: 84.9801  loss_bbox: 57.2522  loss_dfl: 66.5625
04/24 01:59:32 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 01:59:43 - mmengine - INFO - Epoch(train)  [14][ 50/290]  base_lr: 2.0000e-03 lr: 1.7624e-03  eta: 1:21:56  time: 0.2285  data_time: 0.0670  memory: 4630  grad_norm: 0.0000  loss: 209.0704  loss_cls: 85.0209  loss_bbox: 57.4870  loss_dfl: 66.5625
04/24 01:59:53 - mmengine - INFO - Epoch(train)  [14][100/290]  base_lr: 2.0000e-03 lr: 1.7624e-03  eta: 1:21:42  time: 0.1840  data_time: 0.0313  memory: 4590  grad_norm: 0.0000  loss: 208.8153  loss_cls: 85.0152  loss_bbox: 57.2376  loss_dfl: 66.5625
04/24 02:00:02 - mmengine - INFO - Epoch(train)  [14][150/290]  base_lr: 2.0000e-03 lr: 1.7624e-03  eta: 1:21:32  time: 0.1932  data_time: 0.0170  memory: 4723  grad_norm: 0.0000  loss: 208.6665  loss_cls: 84.9501  loss_bbox: 57.1539  loss_dfl: 66.5625
04/24 02:00:11 - mmengine - INFO - Epoch(train)  [14][200/290]  base_lr: 2.0000e-03 lr: 1.7624e-03  eta: 1:21:15  time: 0.1714  data_time: 0.0092  memory: 4550  grad_norm: 0.0000  loss: 208.7670  loss_cls: 84.8790  loss_bbox: 57.3255  loss_dfl: 66.5625
04/24 02:00:16 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:00:20 - mmengine - INFO - Epoch(train)  [14][250/290]  base_lr: 2.0000e-03 lr: 1.7624e-03  eta: 1:20:59  time: 0.1763  data_time: 0.0140  memory: 4710  grad_norm: 0.0000  loss: 208.7337  loss_cls: 85.1405  loss_bbox: 57.0307  loss_dfl: 66.5625
04/24 02:00:27 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:00:39 - mmengine - INFO - Epoch(train)  [15][ 50/290]  base_lr: 2.0000e-03 lr: 1.7426e-03  eta: 1:20:55  time: 0.2442  data_time: 0.0599  memory: 4750  grad_norm: 0.0000  loss: 208.3945  loss_cls: 84.8151  loss_bbox: 57.0169  loss_dfl: 66.5625
04/24 02:00:48 - mmengine - INFO - Epoch(train)  [15][100/290]  base_lr: 2.0000e-03 lr: 1.7426e-03  eta: 1:20:39  time: 0.1741  data_time: 0.0121  memory: 4376  grad_norm: 0.0000  loss: 208.8766  loss_cls: 85.1232  loss_bbox: 57.1910  loss_dfl: 66.5625
04/24 02:00:58 - mmengine - INFO - Epoch(train)  [15][150/290]  base_lr: 2.0000e-03 lr: 1.7426e-03  eta: 1:20:32  time: 0.2040  data_time: 0.0181  memory: 4590  grad_norm: 0.0000  loss: 209.0517  loss_cls: 84.9782  loss_bbox: 57.5111  loss_dfl: 66.5625
04/24 02:01:09 - mmengine - INFO - Epoch(train)  [15][200/290]  base_lr: 2.0000e-03 lr: 1.7426e-03  eta: 1:20:25  time: 0.2046  data_time: 0.0230  memory: 4564  grad_norm: 0.0000  loss: 209.3056  loss_cls: 85.0075  loss_bbox: 57.7356  loss_dfl: 66.5625
04/24 02:01:21 - mmengine - INFO - Epoch(train)  [15][250/290]  base_lr: 2.0000e-03 lr: 1.7426e-03  eta: 1:20:31  time: 0.2512  data_time: 0.0564  memory: 4416  grad_norm: 0.0000  loss: 209.2686  loss_cls: 85.1197  loss_bbox: 57.5864  loss_dfl: 66.5625
04/24 02:01:31 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:01:31 - mmengine - INFO - Saving checkpoint at 15 epochs
04/24 02:01:38 - mmengine - INFO - Epoch(val)  [15][  50/1160]    eta: 0:01:05  time: 0.0589  data_time: 0.0029  memory: 4496  
04/24 02:01:40 - mmengine - INFO - Epoch(val)  [15][ 100/1160]    eta: 0:00:54  time: 0.0442  data_time: 0.0024  memory: 597  
04/24 02:01:42 - mmengine - INFO - Epoch(val)  [15][ 150/1160]    eta: 0:00:46  time: 0.0357  data_time: 0.0012  memory: 597  
04/24 02:01:44 - mmengine - INFO - Epoch(val)  [15][ 200/1160]    eta: 0:00:44  time: 0.0471  data_time: 0.0045  memory: 597  
04/24 02:01:47 - mmengine - INFO - Epoch(val)  [15][ 250/1160]    eta: 0:00:43  time: 0.0518  data_time: 0.0045  memory: 597  
04/24 02:01:49 - mmengine - INFO - Epoch(val)  [15][ 300/1160]    eta: 0:00:40  time: 0.0456  data_time: 0.0017  memory: 597  
04/24 02:01:51 - mmengine - INFO - Epoch(val)  [15][ 350/1160]    eta: 0:00:36  time: 0.0335  data_time: 0.0009  memory: 597  
04/24 02:01:53 - mmengine - INFO - Epoch(val)  [15][ 400/1160]    eta: 0:00:34  time: 0.0433  data_time: 0.0007  memory: 597  
04/24 02:01:56 - mmengine - INFO - Epoch(val)  [15][ 450/1160]    eta: 0:00:32  time: 0.0528  data_time: 0.0018  memory: 597  
04/24 02:01:58 - mmengine - INFO - Epoch(val)  [15][ 500/1160]    eta: 0:00:29  time: 0.0403  data_time: 0.0016  memory: 597  
04/24 02:02:00 - mmengine - INFO - Epoch(val)  [15][ 550/1160]    eta: 0:00:27  time: 0.0457  data_time: 0.0019  memory: 597  
04/24 02:02:02 - mmengine - INFO - Epoch(val)  [15][ 600/1160]    eta: 0:00:25  time: 0.0471  data_time: 0.0036  memory: 597  
04/24 02:02:05 - mmengine - INFO - Epoch(val)  [15][ 650/1160]    eta: 0:00:23  time: 0.0473  data_time: 0.0035  memory: 597  
04/24 02:02:07 - mmengine - INFO - Epoch(val)  [15][ 700/1160]    eta: 0:00:20  time: 0.0431  data_time: 0.0027  memory: 597  
04/24 02:02:09 - mmengine - INFO - Epoch(val)  [15][ 750/1160]    eta: 0:00:18  time: 0.0405  data_time: 0.0014  memory: 597  
04/24 02:02:11 - mmengine - INFO - Epoch(val)  [15][ 800/1160]    eta: 0:00:16  time: 0.0479  data_time: 0.0021  memory: 597  
04/24 02:02:13 - mmengine - INFO - Epoch(val)  [15][ 850/1160]    eta: 0:00:13  time: 0.0422  data_time: 0.0011  memory: 597  
04/24 02:02:16 - mmengine - INFO - Epoch(val)  [15][ 900/1160]    eta: 0:00:11  time: 0.0440  data_time: 0.0019  memory: 597  
04/24 02:02:18 - mmengine - INFO - Epoch(val)  [15][ 950/1160]    eta: 0:00:09  time: 0.0458  data_time: 0.0009  memory: 597  
04/24 02:02:21 - mmengine - INFO - Epoch(val)  [15][1000/1160]    eta: 0:00:07  time: 0.0537  data_time: 0.0014  memory: 597  
04/24 02:02:23 - mmengine - INFO - Epoch(val)  [15][1050/1160]    eta: 0:00:04  time: 0.0368  data_time: 0.0008  memory: 597  
04/24 02:02:24 - mmengine - INFO - Epoch(val)  [15][1100/1160]    eta: 0:00:02  time: 0.0394  data_time: 0.0014  memory: 597  
04/24 02:02:27 - mmengine - INFO - Epoch(val)  [15][1150/1160]    eta: 0:00:00  time: 0.0587  data_time: 0.0041  memory: 597  
04/24 02:02:28 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 02:02:28 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.
04/24 02:02:28 - mmengine - INFO - Epoch(val) [15][1160/1160]    data_time: 0.0021  time: 0.0455
04/24 02:02:43 - mmengine - INFO - Epoch(train)  [16][ 50/290]  base_lr: 2.0000e-03 lr: 1.7228e-03  eta: 1:20:54  time: 0.2946  data_time: 0.0972  memory: 4671  grad_norm: 0.0000  loss: 208.6796  loss_cls: 84.9651  loss_bbox: 57.1520  loss_dfl: 66.5625
04/24 02:02:51 - mmengine - INFO - Epoch(train)  [16][100/290]  base_lr: 2.0000e-03 lr: 1.7228e-03  eta: 1:20:37  time: 0.1723  data_time: 0.0100  memory: 4416  grad_norm: 0.0000  loss: 208.9665  loss_cls: 84.9839  loss_bbox: 57.4201  loss_dfl: 66.5625
04/24 02:03:00 - mmengine - INFO - Epoch(train)  [16][150/290]  base_lr: 2.0000e-03 lr: 1.7228e-03  eta: 1:20:22  time: 0.1783  data_time: 0.0146  memory: 4443  grad_norm: 0.0000  loss: 209.0466  loss_cls: 84.9119  loss_bbox: 57.5722  loss_dfl: 66.5625
04/24 02:03:09 - mmengine - INFO - Epoch(train)  [16][200/290]  base_lr: 2.0000e-03 lr: 1.7228e-03  eta: 1:20:08  time: 0.1792  data_time: 0.0081  memory: 4843  grad_norm: 0.0000  loss: 208.4161  loss_cls: 85.0224  loss_bbox: 56.8312  loss_dfl: 66.5625
04/24 02:03:19 - mmengine - INFO - Epoch(train)  [16][250/290]  base_lr: 2.0000e-03 lr: 1.7228e-03  eta: 1:19:57  time: 0.1959  data_time: 0.0164  memory: 4696  grad_norm: 0.0000  loss: 208.5272  loss_cls: 84.8246  loss_bbox: 57.1402  loss_dfl: 66.5625
04/24 02:03:26 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:03:38 - mmengine - INFO - Epoch(train)  [17][ 50/290]  base_lr: 2.0000e-03 lr: 1.7030e-03  eta: 1:19:44  time: 0.2323  data_time: 0.0627  memory: 4483  grad_norm: nan  loss: 209.1800  loss_cls: 85.1599  loss_bbox: 57.4576  loss_dfl: 66.5625
04/24 02:03:47 - mmengine - INFO - Epoch(train)  [17][100/290]  base_lr: 2.0000e-03 lr: 1.7030e-03  eta: 1:19:30  time: 0.1785  data_time: 0.0159  memory: 4310  grad_norm: 0.0000  loss: 208.8468  loss_cls: 85.0197  loss_bbox: 57.2646  loss_dfl: 66.5625
04/24 02:03:56 - mmengine - INFO - Epoch(train)  [17][150/290]  base_lr: 2.0000e-03 lr: 1.7030e-03  eta: 1:19:20  time: 0.1976  data_time: 0.0191  memory: 4564  grad_norm: 0.0000  loss: 209.0484  loss_cls: 85.0211  loss_bbox: 57.4648  loss_dfl: 66.5625
04/24 02:04:05 - mmengine - INFO - Epoch(train)  [17][200/290]  base_lr: 2.0000e-03 lr: 1.7030e-03  eta: 1:19:05  time: 0.1754  data_time: 0.0068  memory: 4483  grad_norm: 0.0000  loss: 208.7286  loss_cls: 85.0010  loss_bbox: 57.1650  loss_dfl: 66.5625
04/24 02:04:14 - mmengine - INFO - Epoch(train)  [17][250/290]  base_lr: 2.0000e-03 lr: 1.7030e-03  eta: 1:18:52  time: 0.1826  data_time: 0.0121  memory: 4564  grad_norm: 0.0000  loss: 209.3213  loss_cls: 85.0166  loss_bbox: 57.7422  loss_dfl: 66.5625
04/24 02:04:21 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:04:33 - mmengine - INFO - Epoch(train)  [18][ 50/290]  base_lr: 2.0000e-03 lr: 1.6832e-03  eta: 1:18:36  time: 0.2400  data_time: 0.0650  memory: 4656  grad_norm: 0.0000  loss: 209.3484  loss_cls: 85.1667  loss_bbox: 57.6192  loss_dfl: 66.5625
04/24 02:04:36 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:04:41 - mmengine - INFO - Epoch(train)  [18][100/290]  base_lr: 2.0000e-03 lr: 1.6832e-03  eta: 1:18:21  time: 0.1733  data_time: 0.0191  memory: 4603  grad_norm: 0.0000  loss: 208.7014  loss_cls: 84.9687  loss_bbox: 57.1701  loss_dfl: 66.5625
04/24 02:04:50 - mmengine - INFO - Epoch(train)  [18][150/290]  base_lr: 2.0000e-03 lr: 1.6832e-03  eta: 1:18:07  time: 0.1802  data_time: 0.0135  memory: 4616  grad_norm: 0.0000  loss: 208.5786  loss_cls: 84.9700  loss_bbox: 57.0462  loss_dfl: 66.5625
04/24 02:05:00 - mmengine - INFO - Epoch(train)  [18][200/290]  base_lr: 2.0000e-03 lr: 1.6832e-03  eta: 1:17:57  time: 0.1919  data_time: 0.0219  memory: 4656  grad_norm: 0.0000  loss: 208.4855  loss_cls: 85.0584  loss_bbox: 56.8646  loss_dfl: 66.5625
04/24 02:05:09 - mmengine - INFO - Epoch(train)  [18][250/290]  base_lr: 2.0000e-03 lr: 1.6832e-03  eta: 1:17:43  time: 0.1810  data_time: 0.0085  memory: 4603  grad_norm: 0.0000  loss: 208.9385  loss_cls: 84.8069  loss_bbox: 57.5691  loss_dfl: 66.5625
04/24 02:05:17 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:05:29 - mmengine - INFO - Epoch(train)  [19][ 50/290]  base_lr: 2.0000e-03 lr: 1.6634e-03  eta: 1:17:34  time: 0.2383  data_time: 0.0762  memory: 4483  grad_norm: 0.0000  loss: 208.8464  loss_cls: 84.9651  loss_bbox: 57.3188  loss_dfl: 66.5625
04/24 02:05:38 - mmengine - INFO - Epoch(train)  [19][100/290]  base_lr: 2.0000e-03 lr: 1.6634e-03  eta: 1:17:22  time: 0.1839  data_time: 0.0159  memory: 4816  grad_norm: 0.0000  loss: 208.7131  loss_cls: 84.9737  loss_bbox: 57.1768  loss_dfl: 66.5625
04/24 02:05:47 - mmengine - INFO - Epoch(train)  [19][150/290]  base_lr: 2.0000e-03 lr: 1.6634e-03  eta: 1:17:07  time: 0.1751  data_time: 0.0111  memory: 4603  grad_norm: 0.0000  loss: 209.0014  loss_cls: 84.9643  loss_bbox: 57.4746  loss_dfl: 66.5625
04/24 02:05:55 - mmengine - INFO - Epoch(train)  [19][200/290]  base_lr: 2.0000e-03 lr: 1.6634e-03  eta: 1:16:53  time: 0.1761  data_time: 0.0116  memory: 4470  grad_norm: 0.0000  loss: 208.7032  loss_cls: 84.9951  loss_bbox: 57.1456  loss_dfl: 66.5625
04/24 02:06:05 - mmengine - INFO - Epoch(train)  [19][250/290]  base_lr: 2.0000e-03 lr: 1.6634e-03  eta: 1:16:44  time: 0.1964  data_time: 0.0228  memory: 4656  grad_norm: 0.0000  loss: 208.4092  loss_cls: 85.0487  loss_bbox: 56.7980  loss_dfl: 66.5625
04/24 02:06:13 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:06:24 - mmengine - INFO - Epoch(train)  [20][ 50/290]  base_lr: 2.0000e-03 lr: 1.6436e-03  eta: 1:16:32  time: 0.2279  data_time: 0.0566  memory: 4830  grad_norm: 0.0000  loss: 208.7113  loss_cls: 84.9710  loss_bbox: 57.1777  loss_dfl: 66.5625
04/24 02:06:33 - mmengine - INFO - Epoch(train)  [20][100/290]  base_lr: 2.0000e-03 lr: 1.6436e-03  eta: 1:16:18  time: 0.1735  data_time: 0.0090  memory: 4590  grad_norm: 0.0000  loss: 208.7369  loss_cls: 85.0895  loss_bbox: 57.0849  loss_dfl: 66.5625
04/24 02:06:43 - mmengine - INFO - Epoch(train)  [20][150/290]  base_lr: 2.0000e-03 lr: 1.6436e-03  eta: 1:16:09  time: 0.2039  data_time: 0.0207  memory: 5030  grad_norm: 0.0000  loss: 208.9192  loss_cls: 85.0290  loss_bbox: 57.3277  loss_dfl: 66.5625
04/24 02:06:52 - mmengine - INFO - Epoch(train)  [20][200/290]  base_lr: 2.0000e-03 lr: 1.6436e-03  eta: 1:15:55  time: 0.1741  data_time: 0.0130  memory: 4603  grad_norm: 0.0000  loss: 208.9762  loss_cls: 85.0598  loss_bbox: 57.3539  loss_dfl: 66.5625
04/24 02:07:00 - mmengine - INFO - Epoch(train)  [20][250/290]  base_lr: 2.0000e-03 lr: 1.6436e-03  eta: 1:15:38  time: 0.1600  data_time: 0.0052  memory: 4536  grad_norm: 0.0000  loss: 208.7317  loss_cls: 84.9639  loss_bbox: 57.2053  loss_dfl: 66.5625
04/24 02:07:07 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:07:07 - mmengine - INFO - Saving checkpoint at 20 epochs
04/24 02:07:11 - mmengine - INFO - Epoch(val)  [20][  50/1160]    eta: 0:00:32  time: 0.0297  data_time: 0.0012  memory: 4496  
04/24 02:07:13 - mmengine - INFO - Epoch(val)  [20][ 100/1160]    eta: 0:00:31  time: 0.0299  data_time: 0.0010  memory: 597  
04/24 02:07:15 - mmengine - INFO - Epoch(val)  [20][ 150/1160]    eta: 0:00:32  time: 0.0365  data_time: 0.0014  memory: 597  
04/24 02:07:17 - mmengine - INFO - Epoch(val)  [20][ 200/1160]    eta: 0:00:31  time: 0.0338  data_time: 0.0009  memory: 597  
04/24 02:07:18 - mmengine - INFO - Epoch(val)  [20][ 250/1160]    eta: 0:00:28  time: 0.0254  data_time: 0.0005  memory: 597  
04/24 02:07:19 - mmengine - INFO - Epoch(val)  [20][ 300/1160]    eta: 0:00:25  time: 0.0213  data_time: 0.0004  memory: 597  
04/24 02:07:20 - mmengine - INFO - Epoch(val)  [20][ 350/1160]    eta: 0:00:23  time: 0.0272  data_time: 0.0020  memory: 597  
04/24 02:07:22 - mmengine - INFO - Epoch(val)  [20][ 400/1160]    eta: 0:00:22  time: 0.0338  data_time: 0.0010  memory: 597  
04/24 02:07:23 - mmengine - INFO - Epoch(val)  [20][ 450/1160]    eta: 0:00:21  time: 0.0286  data_time: 0.0008  memory: 597  
04/24 02:07:25 - mmengine - INFO - Epoch(val)  [20][ 500/1160]    eta: 0:00:19  time: 0.0349  data_time: 0.0010  memory: 597  
04/24 02:07:26 - mmengine - INFO - Epoch(val)  [20][ 550/1160]    eta: 0:00:18  time: 0.0253  data_time: 0.0006  memory: 597  
04/24 02:07:28 - mmengine - INFO - Epoch(val)  [20][ 600/1160]    eta: 0:00:16  time: 0.0319  data_time: 0.0005  memory: 597  
04/24 02:07:29 - mmengine - INFO - Epoch(val)  [20][ 650/1160]    eta: 0:00:15  time: 0.0300  data_time: 0.0012  memory: 597  
04/24 02:07:31 - mmengine - INFO - Epoch(val)  [20][ 700/1160]    eta: 0:00:13  time: 0.0247  data_time: 0.0009  memory: 597  
04/24 02:07:32 - mmengine - INFO - Epoch(val)  [20][ 750/1160]    eta: 0:00:12  time: 0.0296  data_time: 0.0010  memory: 597  
04/24 02:07:34 - mmengine - INFO - Epoch(val)  [20][ 800/1160]    eta: 0:00:10  time: 0.0292  data_time: 0.0008  memory: 597  
04/24 02:07:35 - mmengine - INFO - Epoch(val)  [20][ 850/1160]    eta: 0:00:09  time: 0.0273  data_time: 0.0014  memory: 597  
04/24 02:07:37 - mmengine - INFO - Epoch(val)  [20][ 900/1160]    eta: 0:00:07  time: 0.0338  data_time: 0.0008  memory: 597  
04/24 02:07:38 - mmengine - INFO - Epoch(val)  [20][ 950/1160]    eta: 0:00:06  time: 0.0229  data_time: 0.0014  memory: 597  
04/24 02:07:40 - mmengine - INFO - Epoch(val)  [20][1000/1160]    eta: 0:00:04  time: 0.0340  data_time: 0.0011  memory: 597  
04/24 02:07:41 - mmengine - INFO - Epoch(val)  [20][1050/1160]    eta: 0:00:03  time: 0.0282  data_time: 0.0005  memory: 597  
04/24 02:07:43 - mmengine - INFO - Epoch(val)  [20][1100/1160]    eta: 0:00:01  time: 0.0295  data_time: 0.0011  memory: 597  
04/24 02:07:44 - mmengine - INFO - Epoch(val)  [20][1150/1160]    eta: 0:00:00  time: 0.0355  data_time: 0.0016  memory: 597  
04/24 02:07:45 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 02:07:45 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.
04/24 02:07:45 - mmengine - INFO - Epoch(val) [20][1160/1160]    data_time: 0.0010  time: 0.0297
04/24 02:07:55 - mmengine - INFO - Epoch(train)  [21][ 50/290]  base_lr: 2.0000e-03 lr: 1.6238e-03  eta: 1:15:20  time: 0.2167  data_time: 0.0588  memory: 4750  grad_norm: 0.0000  loss: 208.4572  loss_cls: 85.0767  loss_bbox: 56.8180  loss_dfl: 66.5625
04/24 02:08:05 - mmengine - INFO - Epoch(train)  [21][100/290]  base_lr: 2.0000e-03 lr: 1.6238e-03  eta: 1:15:08  time: 0.1841  data_time: 0.0152  memory: 4577  grad_norm: 0.0000  loss: 209.0636  loss_cls: 85.0925  loss_bbox: 57.4086  loss_dfl: 66.5625
04/24 02:08:13 - mmengine - INFO - Epoch(train)  [21][150/290]  base_lr: 2.0000e-03 lr: 1.6238e-03  eta: 1:14:53  time: 0.1676  data_time: 0.0060  memory: 4443  grad_norm: 0.0000  loss: 209.1516  loss_cls: 85.0927  loss_bbox: 57.4963  loss_dfl: 66.5625
04/24 02:08:23 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:08:23 - mmengine - INFO - Epoch(train)  [21][200/290]  base_lr: 2.0000e-03 lr: 1.6238e-03  eta: 1:14:42  time: 0.1913  data_time: 0.0171  memory: 4523  grad_norm: 0.0000  loss: 208.8522  loss_cls: 85.0758  loss_bbox: 57.2139  loss_dfl: 66.5625
04/24 02:08:31 - mmengine - INFO - Epoch(train)  [21][250/290]  base_lr: 2.0000e-03 lr: 1.6238e-03  eta: 1:14:28  time: 0.1692  data_time: 0.0117  memory: 4803  grad_norm: 0.0000  loss: 209.1729  loss_cls: 85.0918  loss_bbox: 57.5187  loss_dfl: 66.5625
04/24 02:08:37 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:08:49 - mmengine - INFO - Epoch(train)  [22][ 50/290]  base_lr: 2.0000e-03 lr: 1.6040e-03  eta: 1:14:12  time: 0.2349  data_time: 0.0747  memory: 4616  grad_norm: 0.0000  loss: 209.1220  loss_cls: 85.1554  loss_bbox: 57.4042  loss_dfl: 66.5625
04/24 02:08:58 - mmengine - INFO - Epoch(train)  [22][100/290]  base_lr: 2.0000e-03 lr: 1.6040e-03  eta: 1:13:58  time: 0.1720  data_time: 0.0159  memory: 4603  grad_norm: 0.0000  loss: 208.8812  loss_cls: 84.9711  loss_bbox: 57.3476  loss_dfl: 66.5625
04/24 02:09:07 - mmengine - INFO - Epoch(train)  [22][150/290]  base_lr: 2.0000e-03 lr: 1.6040e-03  eta: 1:13:46  time: 0.1818  data_time: 0.0124  memory: 4656  grad_norm: 0.0000  loss: 209.3741  loss_cls: 85.0804  loss_bbox: 57.7312  loss_dfl: 66.5625
04/24 02:09:16 - mmengine - INFO - Epoch(train)  [22][200/290]  base_lr: 2.0000e-03 lr: 1.6040e-03  eta: 1:13:33  time: 0.1776  data_time: 0.0132  memory: 4656  grad_norm: 0.0000  loss: 208.6347  loss_cls: 85.1128  loss_bbox: 56.9594  loss_dfl: 66.5625
04/24 02:09:24 - mmengine - INFO - Epoch(train)  [22][250/290]  base_lr: 2.0000e-03 lr: 1.6040e-03  eta: 1:13:19  time: 0.1691  data_time: 0.0042  memory: 4577  grad_norm: 0.0000  loss: 208.9543  loss_cls: 85.1298  loss_bbox: 57.2620  loss_dfl: 66.5625
04/24 02:09:30 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:09:42 - mmengine - INFO - Epoch(train)  [23][ 50/290]  base_lr: 2.0000e-03 lr: 1.5842e-03  eta: 1:13:03  time: 0.2389  data_time: 0.0685  memory: 4843  grad_norm: 0.0000  loss: 208.9363  loss_cls: 85.0100  loss_bbox: 57.3638  loss_dfl: 66.5625
04/24 02:09:51 - mmengine - INFO - Epoch(train)  [23][100/290]  base_lr: 2.0000e-03 lr: 1.5842e-03  eta: 1:12:50  time: 0.1721  data_time: 0.0099  memory: 4483  grad_norm: 0.0000  loss: 208.9689  loss_cls: 84.9381  loss_bbox: 57.4683  loss_dfl: 66.5625
04/24 02:10:00 - mmengine - INFO - Epoch(train)  [23][150/290]  base_lr: 2.0000e-03 lr: 1.5842e-03  eta: 1:12:36  time: 0.1684  data_time: 0.0103  memory: 4883  grad_norm: 0.0000  loss: 209.0491  loss_cls: 85.0182  loss_bbox: 57.4683  loss_dfl: 66.5625
04/24 02:10:08 - mmengine - INFO - Epoch(train)  [23][200/290]  base_lr: 2.0000e-03 lr: 1.5842e-03  eta: 1:12:22  time: 0.1671  data_time: 0.0064  memory: 4564  grad_norm: 0.0000  loss: 209.1266  loss_cls: 85.1896  loss_bbox: 57.3745  loss_dfl: 66.5625
04/24 02:10:17 - mmengine - INFO - Epoch(train)  [23][250/290]  base_lr: 2.0000e-03 lr: 1.5842e-03  eta: 1:12:10  time: 0.1837  data_time: 0.0105  memory: 4910  grad_norm: 0.0000  loss: 208.2432  loss_cls: 84.8034  loss_bbox: 56.8773  loss_dfl: 66.5625
04/24 02:10:24 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:10:35 - mmengine - INFO - Epoch(train)  [24][ 50/290]  base_lr: 2.0000e-03 lr: 1.5644e-03  eta: 1:11:55  time: 0.2201  data_time: 0.1276  memory: 4577  grad_norm: 0.0000  loss: 209.5880  loss_cls: 85.2076  loss_bbox: 57.8179  loss_dfl: 66.5625
04/24 02:10:45 - mmengine - INFO - Epoch(train)  [24][100/290]  base_lr: 2.0000e-03 lr: 1.5644e-03  eta: 1:11:46  time: 0.1960  data_time: 0.0311  memory: 4696  grad_norm: 0.0000  loss: 208.6001  loss_cls: 84.8341  loss_bbox: 57.2035  loss_dfl: 66.5625
04/24 02:10:54 - mmengine - INFO - Epoch(train)  [24][150/290]  base_lr: 2.0000e-03 lr: 1.5644e-03  eta: 1:11:32  time: 0.1715  data_time: 0.0128  memory: 4976  grad_norm: 0.0000  loss: 208.6136  loss_cls: 85.1773  loss_bbox: 56.8737  loss_dfl: 66.5625
04/24 02:11:02 - mmengine - INFO - Epoch(train)  [24][200/290]  base_lr: 2.0000e-03 lr: 1.5644e-03  eta: 1:11:18  time: 0.1651  data_time: 0.0151  memory: 4936  grad_norm: 0.0000  loss: 208.7838  loss_cls: 85.0152  loss_bbox: 57.2062  loss_dfl: 66.5625
04/24 02:11:11 - mmengine - INFO - Epoch(train)  [24][250/290]  base_lr: 2.0000e-03 lr: 1.5644e-03  eta: 1:11:07  time: 0.1854  data_time: 0.0121  memory: 4443  grad_norm: 0.0000  loss: 208.3897  loss_cls: 84.9978  loss_bbox: 56.8295  loss_dfl: 66.5625
04/24 02:11:18 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:11:28 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:11:30 - mmengine - INFO - Epoch(train)  [25][ 50/290]  base_lr: 2.0000e-03 lr: 1.5446e-03  eta: 1:10:52  time: 0.2248  data_time: 0.0592  memory: 4671  grad_norm: 0.0000  loss: 208.8650  loss_cls: 85.1059  loss_bbox: 57.1966  loss_dfl: 66.5625
04/24 02:11:38 - mmengine - INFO - Epoch(train)  [25][100/290]  base_lr: 2.0000e-03 lr: 1.5446e-03  eta: 1:10:40  time: 0.1743  data_time: 0.0080  memory: 4430  grad_norm: 0.0000  loss: 208.8574  loss_cls: 84.8932  loss_bbox: 57.4017  loss_dfl: 66.5625
04/24 02:11:47 - mmengine - INFO - Epoch(train)  [25][150/290]  base_lr: 2.0000e-03 lr: 1.5446e-03  eta: 1:10:27  time: 0.1710  data_time: 0.0085  memory: 4456  grad_norm: 0.0000  loss: 209.0664  loss_cls: 85.1000  loss_bbox: 57.4039  loss_dfl: 66.5625
04/24 02:11:56 - mmengine - INFO - Epoch(train)  [25][200/290]  base_lr: 2.0000e-03 lr: 1.5446e-03  eta: 1:10:16  time: 0.1838  data_time: 0.0111  memory: 4577  grad_norm: 0.0000  loss: 208.9330  loss_cls: 85.0132  loss_bbox: 57.3574  loss_dfl: 66.5625
04/24 02:12:05 - mmengine - INFO - Epoch(train)  [25][250/290]  base_lr: 2.0000e-03 lr: 1.5446e-03  eta: 1:10:04  time: 0.1821  data_time: 0.0143  memory: 4470  grad_norm: 0.0000  loss: 208.9774  loss_cls: 85.0665  loss_bbox: 57.3484  loss_dfl: 66.5625
04/24 02:12:12 - mmengine - INFO - Exp name: yolo_world_v2_l_vlpan_bn_2e-4_80e_8gpus_mask-refine_prompt_tuning_coco_20240424_014520
04/24 02:12:12 - mmengine - INFO - Saving checkpoint at 25 epochs
04/24 02:12:16 - mmengine - INFO - Epoch(val)  [25][  50/1160]    eta: 0:00:26  time: 0.0240  data_time: 0.0016  memory: 4750  
04/24 02:12:18 - mmengine - INFO - Epoch(val)  [25][ 100/1160]    eta: 0:00:27  time: 0.0286  data_time: 0.0008  memory: 597  
04/24 02:12:19 - mmengine - INFO - Epoch(val)  [25][ 150/1160]    eta: 0:00:28  time: 0.0306  data_time: 0.0004  memory: 597  
04/24 02:12:21 - mmengine - INFO - Epoch(val)  [25][ 200/1160]    eta: 0:00:25  time: 0.0239  data_time: 0.0006  memory: 597  
04/24 02:12:22 - mmengine - INFO - Epoch(val)  [25][ 250/1160]    eta: 0:00:25  time: 0.0328  data_time: 0.0027  memory: 597  
04/24 02:12:24 - mmengine - INFO - Epoch(val)  [25][ 300/1160]    eta: 0:00:23  time: 0.0262  data_time: 0.0004  memory: 597  
04/24 02:12:24 - mmengine - INFO - Epoch(val)  [25][ 350/1160]    eta: 0:00:21  time: 0.0181  data_time: 0.0003  memory: 597  
04/24 02:12:26 - mmengine - INFO - Epoch(val)  [25][ 400/1160]    eta: 0:00:19  time: 0.0231  data_time: 0.0004  memory: 597  
04/24 02:12:27 - mmengine - INFO - Epoch(val)  [25][ 450/1160]    eta: 0:00:18  time: 0.0283  data_time: 0.0014  memory: 597  
04/24 02:12:29 - mmengine - INFO - Epoch(val)  [25][ 500/1160]    eta: 0:00:17  time: 0.0322  data_time: 0.0010  memory: 597  
04/24 02:12:30 - mmengine - INFO - Epoch(val)  [25][ 550/1160]    eta: 0:00:16  time: 0.0369  data_time: 0.0011  memory: 597  
04/24 02:12:32 - mmengine - INFO - Epoch(val)  [25][ 600/1160]    eta: 0:00:15  time: 0.0263  data_time: 0.0004  memory: 597  
04/24 02:12:33 - mmengine - INFO - Epoch(val)  [25][ 650/1160]    eta: 0:00:13  time: 0.0201  data_time: 0.0004  memory: 597  
04/24 02:12:34 - mmengine - INFO - Epoch(val)  [25][ 700/1160]    eta: 0:00:12  time: 0.0300  data_time: 0.0005  memory: 597  
04/24 02:12:36 - mmengine - INFO - Epoch(val)  [25][ 750/1160]    eta: 0:00:11  time: 0.0274  data_time: 0.0006  memory: 597  
04/24 02:12:37 - mmengine - INFO - Epoch(val)  [25][ 800/1160]    eta: 0:00:09  time: 0.0278  data_time: 0.0009  memory: 597  
04/24 02:12:39 - mmengine - INFO - Epoch(val)  [25][ 850/1160]    eta: 0:00:08  time: 0.0327  data_time: 0.0011  memory: 597  
04/24 02:12:40 - mmengine - INFO - Epoch(val)  [25][ 900/1160]    eta: 0:00:07  time: 0.0255  data_time: 0.0005  memory: 597  
04/24 02:12:41 - mmengine - INFO - Epoch(val)  [25][ 950/1160]    eta: 0:00:05  time: 0.0202  data_time: 0.0005  memory: 597  
04/24 02:12:42 - mmengine - INFO - Epoch(val)  [25][1000/1160]    eta: 0:00:04  time: 0.0261  data_time: 0.0009  memory: 597  
04/24 02:12:44 - mmengine - INFO - Epoch(val)  [25][1050/1160]    eta: 0:00:03  time: 0.0330  data_time: 0.0015  memory: 597  
04/24 02:12:45 - mmengine - INFO - Epoch(val)  [25][1100/1160]    eta: 0:00:01  time: 0.0258  data_time: 0.0012  memory: 597  
04/24 02:12:47 - mmengine - INFO - Epoch(val)  [25][1150/1160]    eta: 0:00:00  time: 0.0282  data_time: 0.0006  memory: 597  
04/24 02:12:47 - mmengine - INFO - Evaluating bbox...
Loading and preparing results...
04/24 02:12:47 - mmengine - ERROR - /opt/conda/lib/python3.10/site-packages/mmdet/evaluation/metrics/coco_metric.py - compute_metrics - 469 - The testing results of the whole dataset is empty.

Hi @zhujiajian98, I also encountered the same validation error ("The testing results of the whole dataset is empty"). Have you solved it?

chenjiafu-George commented 5 months ago
  • INFO - Epoch(train) [16][100/290] base_lr: 2.0000e-03 lr: 1.7228e-03 eta: 1:20:37 time: 0.1723 data_time: 0.0100 memory: 4416 grad_norm: 0.0000 loss: 208.9665 loss_cls: 84.9839 loss_b

Hi @lin-whale, what dataset are you using, and roughly how large is it? When I run my dataset, why does only one loss decrease while the other two losses remain unchanged?

lin-whale commented 5 months ago

@chenjiafu-George The original issue was caused by not specifying the pretrained model weights. Did you provide them?

lin-whale commented 5 months ago
  • INFO - Epoch(train) [16][100/290] base_lr: 2.0000e-03 lr: 1.7228e-03 eta: 1:20:37 time: 0.1723 data_time: 0.0100 memory: 4416 grad_norm: 0.0000 loss: 208.9665 loss_cls: 84.9839 loss_b

Hi @lin-whale, what dataset are you using, and roughly how large is it? When I run my dataset, why does only one loss decrease while the other two losses remain unchanged?

My dataset is custom, and it works well on about 1000 images.

chenjiafu-George commented 5 months ago

@chenjiafu-George The original issue was caused by not specifying the pretrained model weights. Did you provide them?

Hi @lin-whale, sorry, I can hardly help you with this problem; I didn't use this pretrained model.

pelinsuacar commented 4 months ago

@zhujiajian98 After setting load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth', I solved the problem of the loss not decreasing during training, and the predictions are informative now. It is just not as good as fine-tuning directly: prompt tuning reaches about 70% AP, while direct fine-tuning reaches about 90% AP.

Hi @lin-whale, can I ask where you found the pre-trained weights?

lin-whale commented 4 months ago

@zhujiajian98 After setting load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth', I solved the problem of the loss not decreasing during training, and the predictions are informative now. It is just not as good as fine-tuning directly: prompt tuning reaches about 70% AP, while direct fine-tuning reaches about 90% AP.

Hi @lin-whale, can I ask where you found the pre-trained weights?

here

pelinsuacar commented 4 months ago

@lin-whale Thank you! One more thing: when you set load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth', didn't you receive an error like this:

The model and loaded state dict do not match exactly

size mismatch for embeddings: copying a param with shape torch.Size([80, 512]) from checkpoint, the shape in current model is torch.Size([50, 512])

lin-whale commented 4 months ago

@lin-whale Thank you! One more thing: when you set load_from = 'pretrained_models/yolo_world_l_clip_base_dual_vlpan_2e-3adamw_32xb16_100e_o365_goldg_cc3mlite_train_pretrained-7a5eea3b.pth', didn't you receive an error like this:

The model and loaded state dict do not match exactly

size mismatch for embeddings: copying a param with shape torch.Size([80, 512]) from checkpoint, the shape in current model is torch.Size([50, 512])

It seems the cause is a mismatch somewhere in the config. Here is my config, which works correctly; or you can upload your config file for more details.

_base_ = (
    '../../third_party/mmyolo/configs/yolov8/'
    'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(
    imports=['yolo_world'],
    allow_failed_imports=False)

# hyper-parameters
num_classes = 50
num_training_classes = 50
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 64
load_from = 'pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth'  # specifying pretrained weights is what fixed the flat loss
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False
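
The mismatch message above comes from the embeddings tensor: its first dimension must equal num_prompts / num_classes in the config (50 here, 80 in a COCO checkpoint). For illustration only, here is a minimal sketch of generating a matching .npy file for embedding_path — it assumes the HuggingFace transformers CLIP API and placeholder class names, and is not the repo's official tooling:

# Minimal sketch (unofficial): build the class-embedding .npy referenced by
# `embedding_path`. The saved array's first dimension must equal
# `num_prompts` / `num_classes`, otherwise loading fails with exactly the
# torch.Size mismatch quoted above.
import numpy as np
import torch
from transformers import CLIPTextModelWithProjection, CLIPTokenizer

class_names = ['class_a', 'class_b']  # placeholder: replace with your own categories

tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch32')
text_model = CLIPTextModelWithProjection.from_pretrained('openai/clip-vit-base-patch32')

with torch.no_grad():
    inputs = tokenizer(class_names, padding=True, return_tensors='pt')
    embeds = text_model(**inputs).text_embeds            # shape: (num_classes, 512)
    embeds = embeds / embeds.norm(dim=-1, keepdim=True)  # L2-normalize, as CLIP does

np.save('embeddings/custom_classes.npy', embeds.numpy())
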
pelinsuacar commented 4 months ago

@lin-whale Here is my config file:

_base_ = ('../../third_party/mmyolo/configs/yolov8/'
          'yolov8_l_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(imports=['yolo_world'], allow_failed_imports=False)

# hyper-parameters
num_classes = 1
num_training_classes = 1
max_epochs = 10  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-3
weight_decay = 0.0005
train_batch_size_per_gpu = 4
#load_from = 'pretrained_models/yolo_world_l_clip_t2i_bn_2e-3adamw_32xb16-100e_obj365v1_goldg_cc3mlite_train-ca93cd1f.pth'
load_from = 'pretrained_models/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco_ep80-e1288152.pth'  # COCO fine-tuned checkpoint: stores an 80-class embeddings tensor
persistent_workers = False

# model settings
model = dict(type='SimpleYOLOWorldDetector',
             mm_neck=True,
             num_train_classes=num_training_classes,
             num_test_classes=num_classes,
             embedding_path='embeddings/yoloworld_wheat_emb.npy',
             prompt_dim=text_channels,
             num_prompts=1,
             freeze_prompt=True,
             data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
             backbone=dict(_delete_=True,
                           type='MultiModalYOLOBackbone',
                           text_model=None,
                           image_model={{_base_.model.backbone}},
                           with_text_model=False),
             neck=dict(type='YOLOWorldPAFPN',
                       freeze_all=False,
                       guide_channels=text_channels,
                       embed_channels=neck_embed_channels,
                       num_heads=neck_num_heads,
                       block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
             bbox_head=dict(type='YOLOWorldHead',
                            head_module=dict(
                                type='YOLOWorldHeadModule',
                                freeze_all=False,
                                use_bn_head=True,
                                embed_dims=text_channels,
                                num_classes=num_training_classes)),
             train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
meta_info = dict(classes=('wheat', ))  # note the trailing comma: ('wheat') is a plain string, not a 1-tuple
# dataset settings
coco_train_dataset = dict(type='YOLOv5CocoDataset',
                          metainfo=meta_info,
                          data_root='data/wheat',
                          ann_file='annotations/wheat_coco_train_ann.json',
                          data_prefix=dict(img='wheat_train/'),
                          filter_cfg=dict(filter_empty_gt=False, min_size=32),
                          pipeline=_base_.train_pipeline)

train_dataloader = dict(persistent_workers=persistent_workers,
                        batch_size=train_batch_size_per_gpu,
                        collate_fn=dict(type='yolow_collate'),
                        dataset=coco_train_dataset)

coco_val_dataset = dict(type='YOLOv5CocoDataset',
                        metainfo=meta_info,
                        data_root='data/wheat',
                        ann_file='annotations/wheat_coco_val_ann.json',
                        data_prefix=dict(img='wheat_val/'),
                        filter_cfg=dict(filter_empty_gt=False, min_size=32),
                        pipeline=_base_.test_pipeline)

val_dataloader = dict(dataset=coco_val_dataset)
test_dataloader = val_dataloader
# training settings
default_hooks = dict(param_scheduler=dict(scheduler_type='linear',
                                          lr_factor=0.01,
                                          max_epochs=max_epochs),
                     checkpoint=dict(max_keep_ckpts=-1,
                                     save_best=None,
                                     interval=save_epoch_intervals))
custom_hooks = [
    dict(type='EMAHook',
         ema_type='ExpMomentumEMA',
         momentum=0.0001,
         update_buffers=True,
         strict_load=False,
         priority=49),
    dict(type='mmdet.PipelineSwitchHook',
         switch_epoch=max_epochs - close_mosaic_epochs,
         switch_pipeline=_base_.train_pipeline_stage2)
]
train_cfg = dict(max_epochs=max_epochs,
                 val_interval=5,
                 dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                                     _base_.val_interval_stage2)])

optim_wrapper = dict(optimizer=dict(
    _delete_=True,
    type='SGD',
    lr=base_lr,
    momentum=0.937,
    nesterov=True,
    weight_decay=weight_decay,
    batch_size_per_gpu=train_batch_size_per_gpu))

# evaluation settings
val_evaluator = dict(_delete_=True,
                     type='mmdet.CocoMetric',
                     proposal_nums=(100, 1, 10),
                     ann_file='data/wheat/annotations/wheat_coco_val_ann.json',
                     metric='bbox',
                     classwise=True)

And my output:

Loads checkpoint by local backend from path: pretrained_models/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco_ep80-e1288152.pth
The model and loaded state dict do not match exactly

size mismatch for embeddings: copying a param with shape torch.Size([80, 512]) from checkpoint, the shape in current model is torch.Size([512]).
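
For anyone comparing shapes: a minimal sketch (assuming the usual mmengine checkpoint layout with a 'state_dict' entry) for inspecting the embeddings tensor in a checkpoint before training:

import torch

ckpt = torch.load(
    'pretrained_models/yolo_world_v2_l_vlpan_bn_sgd_1e-3_40e_8gpus_finetune_coco_ep80-e1288152.pth',
    map_location='cpu')
state = ckpt.get('state_dict', ckpt)
# The COCO-finetuned weights store one prompt embedding per COCO class:
print(state['embeddings'].shape)  # torch.Size([80, 512]), per the log above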

lin-whale commented 4 months ago

@pelinsuacar Maybe adjust num_classes = 1 and num_training_classes = 1 and see if the output changes?

pelinsuacar commented 4 months ago

> @pelinsuacar Maybe adjust num_classes = 1 and num_training_classes = 1 and see if the output changes?

but they are already set to 1

lin-whale commented 4 months ago

@pelinsuacar It seems that the COCO dataset contains 80 categories, and torch.Size([80, 512]) means this checkpoint is tied to the COCO classes. Try this checkpoint instead? load_from = 'pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth'

lin-whale commented 4 months ago

> > @pelinsuacar Maybe adjust num_classes = 1 and num_training_classes = 1 and see if the output changes?
>
> but they are already set to 1

I mean actually modify these two parameters.

pelinsuacar commented 4 months ago

> yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth

yes, exactly, but I could not find a suitable checkpoint. I tried yours, but it gives even more errors:

Loads checkpoint by local backend from path: pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth The model and loaded state dict do not match exactly

unexpected key in source state_dict: backbone.text_model.model.text_model.embeddings.token_embedding.weight, backbone.text_model.model.text_model.embeddings.position_embedding.weight, backbone.text_model.model.text_model.encoder.layers.0.self_attn.k_proj.weight, [... every weight and bias of backbone.text_model.model.text_model.encoder.layers.0 through .11 ...], backbone.text_model.model.text_model.final_layer_norm.weight, backbone.text_model.model.text_model.final_layer_norm.bias, backbone.text_model.model.text_projection.weight

missing keys in source state_dict: embeddings


05/30 10:45:01 - mmengine - INFO - Load checkpoint from pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth

This checkpoint has a text model, but in our case we don't. I couldn't understand how it does not give any error with your config. I tried with your config as well, but got the same result :(
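
One workaround for the unexpected text-model keys, as a minimal sketch (the output file name is illustrative, not from the repo): strip backbone.text_model.* from the pretrained checkpoint so it matches a detector built with with_text_model=False, then point load_from at the stripped file. The missing embeddings key is expected, since that array is presumably supplied via embedding_path.

import torch

src = 'pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth'
ckpt = torch.load(src, map_location='cpu')
state = ckpt.get('state_dict', ckpt)
# Drop every text-tower weight; these are exactly the "unexpected keys" above.
state = {k: v for k, v in state.items()
         if not k.startswith('backbone.text_model')}
ckpt['state_dict'] = state
torch.save(ckpt, 'pretrained_models/yolo_world_v2_l_pretrain_no_text.pth')  # illustrative name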

lin-whale commented 4 months ago

> > > @pelinsuacar Maybe adjust num_classes = 1 and num_training_classes = 1 and see if the output changes?
> >
> > but they are already set to 1
>
> I mean actually modify these two parameters.

@pelinsuacar Have you tried this? If the output is different, you can locate your problem. Use the checkpoint you used previously.

pelinsuacar commented 4 months ago

> @pelinsuacar Have you tried this? If the output is different, you can locate your problem.

yes, it did not change anything, because it checks the size of the embedding array that I provide ('embeddings/yoloworld_wheat_emb.npy'), which is (1, 512) instead of (80, 512)

@lin-whale so you were able to use the checkpoint "yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth" with your config that does not have any text model?
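
For context, a (1, 512) array like 'embeddings/yoloworld_wheat_emb.npy' can be produced with the same CLIP variant the configs name. A minimal sketch, assuming the transformers package (this mirrors the expected file contents; it is not the repo's official export script):

import numpy as np
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

inputs = processor(text=['wheat'], return_tensors='pt', padding=True)
with torch.no_grad():
    feats = model.get_text_features(**inputs)        # shape (1, 512)
feats = feats / feats.norm(dim=-1, keepdim=True)     # L2-normalize, as CLIP features usually are
np.save('embeddings/yoloworld_wheat_emb.npy', feats.numpy())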

lin-whale commented 4 months ago

> yes, it did not change anything, because it checks the size of the embedding array that I provide ('embeddings/yoloworld_wheat_emb.npy'), which is (1, 512) instead of (80, 512)
>
> @lin-whale so you were able to use the checkpoint "yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth" with your config that does not have any text model?

I added this line to the config file and put the text model in the folder: text_model_name = 'openai/clip-vit-base-patch32'. You can access it here.
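
If the Hugging Face hub cannot be reached at train time, a minimal sketch (assuming huggingface_hub is installed) for caching the text model locally; text_model_name can then point at the returned folder:

from huggingface_hub import snapshot_download

local_dir = snapshot_download('openai/clip-vit-base-patch32')
print(local_dir)  # use this path as text_model_name for offline runs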

pelinsuacar commented 4 months ago

but isn't the important thing to define the model with the text model as well?

model = dict(type='YOLOWorldPromptDetector',
             mm_neck=True,
             num_train_classes=num_training_classes,
             num_test_classes=num_classes,
             embedding_path='embeddings/hospital_50.npy',
             prompt_dim=text_channels,
             num_prompts=50,
             data_preprocessor=dict(type='YOLOv5DetDataPreprocessor'),
             backbone=dict(_delete_=True,
                           type='MultiModalYOLOBackbone',
                           text_model=None,
                           image_model={{_base_.model.backbone}},
                           frozen_stages=4,
                           with_text_model=False),  # <-- So did you also change this part to True? @lin-whale
             neck=dict(type='YOLOWorldPAFPN',
                       freeze_all=True,
                       guide_channels=text_channels,
                       embed_channels=neck_embed_channels,
                       num_heads=neck_num_heads,
                       block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
             bbox_head=dict(type='YOLOWorldHead',
                            head_module=dict(
                                type='YOLOWorldHeadModule',
                                freeze_all=True,
                                use_bn_head=True,
                                embed_dims=text_channels,
                                num_classes=num_training_classes)),
             train_cfg=dict(assigner=dict(num_classes=num_training_classes)))
lin-whale commented 4 months ago

@pelinsuacar Sure. Here is the full config file.

_base_ = (
    '../../third_party/mmyolo/configs/yolov8/'
    'yolov8_l_mask-refine_syncbn_fast_8xb16-500e_coco.py')
custom_imports = dict(
    imports=['yolo_world'],
    allow_failed_imports=False)

# hyper-parameters
num_classes = 50
num_training_classes = 50
max_epochs = 80  # Maximum training epochs
close_mosaic_epochs = 10
save_epoch_intervals = 5
text_channels = 512
neck_embed_channels = [128, 256, _base_.last_stage_out_channels // 2]
neck_num_heads = [4, 8, _base_.last_stage_out_channels // 2 // 32]
base_lr = 2e-4
weight_decay = 0.05
train_batch_size_per_gpu = 64
load_from = 'pretrained_models/yolo_world_v2_l_obj365v1_goldg_cc3mlite_pretrain-ca93cd1f.pth'
# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
text_model_name = 'openai/clip-vit-base-patch32'
persistent_workers = False

# model settings
model = dict(
    type='YOLOWorldDetector',
    mm_neck=True,
    num_train_classes=num_training_classes,
    num_test_classes=num_classes,
    data_preprocessor=dict(type='YOLOWDetDataPreprocessor'),
    backbone=dict(
        _delete_=True,
        type='MultiModalYOLOBackbone',
        image_model={{_base_.model.backbone}},
        text_model=dict(
            type='HuggingCLIPLanguageBackbone',
            model_name=text_model_name,
            frozen_modules=['all'])),
    neck=dict(type='YOLOWorldPAFPN',
              guide_channels=text_channels,
              embed_channels=neck_embed_channels,
              num_heads=neck_num_heads,
              block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv')),
    bbox_head=dict(type='YOLOWorldHead',
                   head_module=dict(type='YOLOWorldHeadModule',
                                    use_bn_head=True,
                                    embed_dims=text_channels,
                                    num_classes=num_training_classes)),
    train_cfg=dict(assigner=dict(num_classes=num_training_classes)))

# dataset settings
text_transform = [
    dict(type='RandomLoadText',
         num_neg_samples=(num_classes, num_classes),
         max_num_samples=num_training_classes,
         padding_to_max=True,
         padding_value=''),
    dict(type='mmdet.PackDetInputs',
         meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts'))
]
mosaic_affine_transform = [
    dict(
        type='MultiModalMosaic',
        img_scale=_base_.img_scale,
        pad_val=114.0,
        pre_transform=_base_.pre_transform),
    dict(type='YOLOv5CopyPaste', prob=_base_.copypaste_prob),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_aspect_ratio=100.,
        scaling_ratio_range=(1 - _base_.affine_scale,
                             1 + _base_.affine_scale),
        # img_scale is (width, height)
        border=(-_base_.img_scale[0] // 2, -_base_.img_scale[1] // 2),
        border_val=(114, 114, 114),
        min_area_ratio=_base_.min_area_ratio,
        use_mask_refine=_base_.use_mask2refine)
]
train_pipeline = [
    *_base_.pre_transform,
    *mosaic_affine_transform,
    dict(
        type='YOLOv5MultiModalMixUp',
        prob=_base_.mixup_prob,
        pre_transform=[*_base_.pre_transform,
                       *mosaic_affine_transform]),
    *_base_.last_transform[:-1],
    *text_transform
]
train_pipeline_stage2 = [
    *_base_.train_pipeline_stage2[:-1],
    *text_transform
]

meta_info = dict(classes = ('floor', 'person', 'paper', 'bottle', 'paper cup', 'mask', 'thread', 'waiting bench', 'sturdy', 'plastic bag', 'table', 'packaging bag', 'door', 'carton box', 'sticker', 'screen', 'book', 'cotton ball', 'warning sign', 'rod', 'poster rack', 'vomit', 'blood', 'traffic cone', 'trash can', 'cart', 'rack', 'bag', 'flowerpot', 'medication', 'paper box', 'meal box', 'pericarp', 'hat', 'umbrella', 'drip stand', 'coffee stains', 'elevator entrance', 'escalator entrance', 'triage desk', 'registration machine', 'fire hydrant', 'hospital bed', 'milk stains', 'plinth', 'chair', 'wheel chair', 'swab', 'drinking cup', 'fallen leaves'))

coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=meta_info,
        # data_root='/data/cvat/train/2024-04-24-ann-cvat',
        # ann_file='/data/cvat/train/2024-04-24-ann-cvat/annotations/annotations_train.json',
        data_root='/data/cvat/train/bottle_train0429/2024-04-29-ann-cvat',
        ann_file='/data/cvat/train/bottle_train0429/2024-04-29-ann-cvat/annotations/annotations.json',
        data_prefix=dict(img='images/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/hospital_class_texts.json',
    pipeline=train_pipeline)

train_dataloader = dict(
    persistent_workers=persistent_workers,
    batch_size=train_batch_size_per_gpu,
    collate_fn=dict(type='yolow_collate'),
    dataset=coco_train_dataset)
test_pipeline = [
    *_base_.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts'))
]

val_dataroot = "/data/cvat/train/bottle_test0429/2024-04-29-ann-cvat"
val_ann_file = "/data/cvat/train/bottle_test0429/2024-04-29-ann-cvat/annotations/annotations.json"

coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=meta_info,
        # data_root='/data/cvat/train/bottle/2024-04-28-ann-cvat',
        # ann_file='/data/cvat/train/bottle/2024-04-28-ann-cvat/annotations/annotations.json',
        data_root=val_dataroot,
        ann_file=val_ann_file,
        data_prefix=dict(img='images/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    class_text_path='data/texts/hospital_class_texts.json',
    pipeline=test_pipeline)
val_dataloader = dict(    
    persistent_workers=persistent_workers,
    batch_size=train_batch_size_per_gpu,
    dataset=coco_val_dataset)
test_dataloader = val_dataloader
# training settings
default_hooks = dict(
    param_scheduler=dict(
        scheduler_type='linear',
        lr_factor=0.01,
        max_epochs=max_epochs),
    checkpoint=dict(
        max_keep_ckpts=-1,
        save_best="coco/bbox_mAP_50",
        interval=save_epoch_intervals))
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49),
    dict(
        type='mmdet.PipelineSwitchHook',
        switch_epoch=max_epochs - close_mosaic_epochs,
        switch_pipeline=train_pipeline_stage2)
]
train_cfg = dict(
    max_epochs=max_epochs,
    val_interval=5,
    dynamic_intervals=[((max_epochs - close_mosaic_epochs),
                        _base_.val_interval_stage2)])
optim_wrapper = dict(
    optimizer=dict(
        _delete_=True,
        type='AdamW',
        lr=base_lr,
        weight_decay=weight_decay,
        batch_size_per_gpu=train_batch_size_per_gpu),
    paramwise_cfg=dict(
        custom_keys={'backbone.text_model': dict(lr_mult=0.01),
                     'logit_scale': dict(weight_decay=0.0)}),
    constructor='YOLOWv5OptimizerConstructor')
# evaluation settings
val_evaluator = dict(
    _delete_=True,
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    # ann_file='/data/cvat/train/bottle/2024-04-28-ann-cvat/annotations/annotations.json',
    ann_file=val_ann_file,
    metric='bbox',
    classwise=True)

# test settings
test_evaluator = dict(
    _delete_=True,
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    ann_file=val_ann_file,
    # ann_file='/data/cvat/train/bottle/2024-04-28-ann-cvat/annotations/annotations.json',
    metric='bbox',
    classwise=True)
pelinsuacar commented 4 months ago

> @pelinsuacar Sure. Here is the full config file.
>
> [the full config quoted above]

oh okay, I want to get rid of the language model by giving image embeddings of each class instead of class names as text. Since at the beginning you were also using embeddings, I thought it was still the same, but now I see that you are using YOLOWorldDetector instead of SimpleYOLOWorldDetector
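
A minimal sketch of that idea, assuming transformers and one representative crop per class (the image paths and output name are hypothetical): encode reference images with CLIP's image tower and save the (num_classes, 512) array for embedding_path:

import numpy as np
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

image_paths = ['crops/wheat.jpg']  # hypothetical: one reference image per class
model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')

images = [Image.open(p).convert('RGB') for p in image_paths]
inputs = processor(images=images, return_tensors='pt')
with torch.no_grad():
    feats = model.get_image_features(**inputs)       # (num_classes, 512)
feats = feats / feats.norm(dim=-1, keepdim=True)
np.save('embeddings/image_class_embeddings.npy', feats.numpy())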

2879982985 commented 3 weeks ago

model = dict(type='YOLOWorldPromptDetector', — may I ask, is this detector something you defined yourself?

2879982985 commented 3 weeks ago

Hello, the YOLOWorldPromptDetector version has been deprecated and my code does not contain it. Could you send me a copy of that yolo_world.py so I can take a look?