open-mmlab / mmtracking

OpenMMLab Video Perception Toolbox. It supports Video Object Detection (VID), Multiple Object Tracking (MOT), Single Object Tracking (SOT), Video Instance Segmentation (VIS) with a unified framework.
https://mmtracking.readthedocs.io/en/latest/
Apache License 2.0
3.52k stars 588 forks source link

When training ByteTrack on the BDD100K dataset, detection accuracy drops to nearly zero after the L1 loss of YOLOX is added #791

Open zengwz opened 1 year ago

zengwz commented 1 year ago

I trained ByteTrack using the official config, modifying only some hyperparameters and the input dataset. It performs well before the L1 loss is added. image After the L1 loss is added, image the test result in the 4th epoch is: image

the config is:

# Square input resolution. The same (640, 640) literal is repeated in the
# detector settings and in every pipeline stage of this config.
img_scale = (640, 640)
# Top-level model: type 'ByteTrack', configured with a YOLOX detector, a
# KalmanFilter motion model and a ByteTracker tracker.
model = dict(
    detector=dict(
        type='YOLOX',
        input_size=(640, 640),
        # NOTE(review): presumably these control YOLOX multi-scale training
        # (size range and how often the size is re-drawn) -- confirm against
        # the detector implementation.
        random_size_range=(18, 32),
        random_size_interval=10,
        backbone=dict(
            type='CSPDarknet', deepen_factor=1.33, widen_factor=1.25),
        neck=dict(
            type='YOLOXPAFPN',
            in_channels=[320, 640, 1280],
            out_channels=320,
            num_csp_blocks=4),
        bbox_head=dict(
            type='YOLOXHead',
            # 8 matches the number of entries in the `classes` tuple defined
            # further down in this config.
            num_classes=8,
            in_channels=320,
            feat_channels=320),
        train_cfg=dict(
            assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
        test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.7)),
        # Detector weights are initialised from the checkpoint URL below,
        # whose file name identifies it as yolox_x trained on COCO.
        # NOTE(review): num_classes is 8 here; presumably the class-specific
        # head weights of that checkpoint cannot be reused -- confirm.
        init_cfg=dict(
            type='Pretrained',
            checkpoint=
            'https://download.openmmlab.com/mmdetection/v2.0/yolox/yolox_x_8x8_300e_coco/yolox_x_8x8_300e_coco_20211126_140254-1ef88d67.pth'
        )),
    type='ByteTrack',
    motion=dict(type='KalmanFilter'),
    tracker=dict(
        type='ByteTracker',
        # Score and IoU thresholds for the tracker; the exact meaning of each
        # key is defined by the ByteTracker implementation, not shown here.
        obj_score_thrs=dict(high=0.6, low=0.1),
        init_track_thr=0.7,
        weight_iou_with_det_scores=True,
        match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3),
        num_frames_retain=30))
# SGD with Nesterov momentum. `paramwise_cfg` sets the weight-decay
# multiplier to 0.0 for norm layers and biases.
# NOTE(review): `lr` is a hard-coded literal rather than a value derived from
# the batch size (samples_per_gpu = 2, gpu_ids = [0] elsewhere in this
# config) -- confirm it is appropriate for this batch size.
optimizer = dict(
    type='SGD',
    lr=0.001,
    momentum=0.9,
    weight_decay=0.0005,
    nesterov=True,
    paramwise_cfg=dict(norm_decay_mult=0.0, bias_decay_mult=0.0))
# grad_clip is None, i.e. no gradient-clipping settings are supplied.
optimizer_config = dict(grad_clip=None)
# Checkpoint interval of 1 and text logging every 50 (units are defined by the
# runner; presumably epochs and iterations respectively -- confirm).
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
# No checkpoint is loaded or resumed at the runner level; the detector's own
# init_cfg is the only source of pretrained weights in this config.
load_from = None
resume_from = None
workflow = [('train', 1)]
opencv_num_threads = 0
mp_start_method = 'fork'
# Same value as data['samples_per_gpu'] further down in this config.
samples_per_gpu = 2
# Training augmentation pipeline. The same stage list is repeated verbatim as
# data['train']['pipeline'] further down. It contains no image-loading stage:
# loading is done by the inner dataset's own pipeline
# (LoadImageFromFile + LoadAnnotations) inside data['train']['dataset'].
# Unlike test_pipeline, no 'Normalize' stage is listed here.
train_pipeline = [
    dict(
        type='Mosaic',
        img_scale=(640, 640),
        pad_val=114.0,
        bbox_clip_border=False),
    dict(
        type='RandomAffine',
        scaling_ratio_range=(0.1, 2),
        # NOTE(review): -320 looks like -(640 // 2), i.e. half of img_scale
        # negated -- confirm this is the intended relation.
        border=(-320, -320),
        bbox_clip_border=False),
    dict(
        type='MixUp',
        img_scale=(640, 640),
        ratio_range=(0.8, 1.6),
        pad_val=114.0,
        bbox_clip_border=False),
    dict(type='YOLOXHSVRandomAug'),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(
        type='Resize',
        img_scale=(640, 640),
        keep_ratio=True,
        bbox_clip_border=False),
    dict(type='Pad', size_divisor=32, pad_val=dict(img=(114.0, 114.0, 114.0))),
    # min_gt_bbox_wh=(1, 1) with keep_empty=False; the filtering rule itself
    # is defined by the FilterAnnotations transform, not shown here.
    dict(type='FilterAnnotations', min_gt_bbox_wh=(1, 1), keep_empty=False),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
# Evaluation pipeline. The same stage list is repeated verbatim as the
# `pipeline` of both data['val'] and data['test'] further down.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        # A single scale with flip=False.
        img_scale=(640, 640),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            # mean 0 / std 1 with to_rgb=False.
            # NOTE(review): presumably this leaves pixel values numerically
            # unchanged (apart from any dtype conversion) -- confirm.
            dict(
                type='Normalize',
                mean=[0.0, 0.0, 0.0],
                std=[1.0, 1.0, 1.0],
                to_rgb=False),
            dict(
                type='Pad',
                size_divisor=32,
                pad_val=dict(img=(114.0, 114.0, 114.0))),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='VideoCollect', keys=['img'])
        ])
]
# Dataset location and type. The `data` dict below spells out full literal
# paths (all starting with this same prefix) and literal type strings rather
# than referencing these names.
data_root = '/data/BDD100K/data/bdd/'
dataset_type = 'CocoDataset'
# The 8 category names; the same tuple is repeated literally in the train, val
# and test dataset entries below.
classes = ('pedestrian', 'rider', 'car', 'bus', 'truck', 'bicycle',
           'motorcycle', 'train')
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=4,
    persistent_workers=True,
    # Training split: a CocoDataset wrapped in MultiImageMixDataset. The inner
    # dataset only loads images and boxes; the outer pipeline applies the
    # augmentations (same stage list as the top-level train_pipeline).
    train=dict(
        type='MultiImageMixDataset',
        dataset=dict(
            type='CocoDataset',
            ann_file=
            '/data/BDD100K/data/bdd/labels/box_track_20/box_track_train_cocofmt.json',
            img_prefix='/data/BDD100K/data/bdd/images/track/train/',
            classes=('pedestrian', 'rider', 'car', 'bus', 'truck', 'bicycle',
                     'motorcycle', 'train'),
            pipeline=[
                dict(type='LoadImageFromFile'),
                dict(type='LoadAnnotations', with_bbox=True)
            ],
            # NOTE(review): filter_empty_gt=False here, while the outer
            # FilterAnnotations stage uses keep_empty=False -- confirm the
            # combination is intended.
            filter_empty_gt=False),
        pipeline=[
            dict(
                type='Mosaic',
                img_scale=(640, 640),
                pad_val=114.0,
                bbox_clip_border=False),
            dict(
                type='RandomAffine',
                scaling_ratio_range=(0.1, 2),
                border=(-320, -320),
                bbox_clip_border=False),
            dict(
                type='MixUp',
                img_scale=(640, 640),
                ratio_range=(0.8, 1.6),
                pad_val=114.0,
                bbox_clip_border=False),
            dict(type='YOLOXHSVRandomAug'),
            dict(type='RandomFlip', flip_ratio=0.5),
            dict(
                type='Resize',
                img_scale=(640, 640),
                keep_ratio=True,
                bbox_clip_border=False),
            dict(
                type='Pad',
                size_divisor=32,
                pad_val=dict(img=(114.0, 114.0, 114.0))),
            dict(
                type='FilterAnnotations',
                min_gt_bbox_wh=(1, 1),
                keep_empty=False),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
        ]),
    # Validation split: CocoVideoDataset on the val annotation file, using the
    # same stage list as the top-level test_pipeline.
    val=dict(
        type='CocoVideoDataset',
        ann_file=
        '/data/BDD100K/data/bdd/labels/box_track_20/box_track_val_cocofmt.json',
        img_prefix='/data/BDD100K/data/bdd/images/track/val/',
        ref_img_sampler=None,
        classes=('pedestrian', 'rider', 'car', 'bus', 'truck', 'bicycle',
                 'motorcycle', 'train'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(640, 640),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[0.0, 0.0, 0.0],
                        std=[1.0, 1.0, 1.0],
                        to_rgb=False),
                    dict(
                        type='Pad',
                        size_divisor=32,
                        pad_val=dict(img=(114.0, 114.0, 114.0))),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='VideoCollect', keys=['img'])
                ])
        ]),
    # Test split: identical settings to `val` above, including the same val
    # annotation file and image folder.
    test=dict(
        type='CocoVideoDataset',
        ann_file=
        '/data/BDD100K/data/bdd/labels/box_track_20/box_track_val_cocofmt.json',
        img_prefix='/data/BDD100K/data/bdd/images/track/val/',
        ref_img_sampler=None,
        classes=('pedestrian', 'rider', 'car', 'bus', 'truck', 'bicycle',
                 'motorcycle', 'train'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(640, 640),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(
                        type='Normalize',
                        mean=[0.0, 0.0, 0.0],
                        std=[1.0, 1.0, 1.0],
                        to_rgb=False),
                    dict(
                        type='Pad',
                        size_divisor=32,
                        pad_val=dict(img=(114.0, 114.0, 114.0))),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='VideoCollect', keys=['img'])
                ])
        ]))
# Schedule: 6 epochs in total, with the last 2 treated specially. The value 2
# is repeated literally as `num_last_epochs` in lr_config and in the
# YOLOXModeSwitchHook / SyncNormHook entries below.
total_epochs = 6
num_last_epochs = 2
interval = 1
# 'YOLOX' learning-rate policy with exponential warmup counted by epoch
# (warmup_iters=1 together with warmup_by_epoch=True).
lr_config = dict(
    policy='YOLOX',
    warmup='exp',
    by_epoch=False,
    warmup_by_epoch=True,
    warmup_ratio=1,
    warmup_iters=1,
    num_last_epochs=2,
    min_lr_ratio=0.05)
custom_hooks = [
    # NOTE(review): per the discussion in this issue thread, this hook
    # presumably disables Mosaic/MixUp and enables the L1 loss for the last
    # `num_last_epochs` epochs -- confirm against the hook implementation.
    dict(type='YOLOXModeSwitchHook', num_last_epochs=2, priority=48),
    dict(type='SyncNormHook', num_last_epochs=2, interval=1, priority=48),
    # Exponential-momentum EMA hook with momentum 0.0001; it is not given a
    # checkpoint to resume from (resume_from=None).
    dict(
        type='ExpMomentumEMAHook',
        resume_from=None,
        momentum=0.0001,
        priority=49)
]
# Both 'bbox' and 'track' metrics are requested, with interval=1.
evaluation = dict(metric=['bbox', 'track'], interval=1)
search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML']
# Mixed-precision settings: loss scale configured with an initial scale of 512.
fp16 = dict(loss_scale=dict(init_scale=512.0))
work_dir = './work_dirs/bytetrack_yolox_x_bdd100k'
# A single GPU id (0) is listed.
gpu_ids = [0]
dyhBUPT commented 1 year ago

It seems that your model doesn't converge well. How did you add L1 loss?

zengwz commented 1 year ago

As far as I understand, this is handled by "YOLOXModeSwitchHook": in the last `num_last_epochs` epochs it turns off Mosaic and MixUp and adds the L1 loss. The concrete implementation follows the official YOLOX code.

image
yinxingshu41 commented 9 months ago

Could you share how you processed the BDD100K dataset in mmtracking and where did you download the BDD100K dataset? Thank you very much!

zengwz commented 9 months ago

Could you share how you processed the BDD100K dataset in mmtracking and where did you download the BDD100K dataset? Thank you very much!

I processed the BDD100K dataset following QDTrack (https://github.com/SysCV/qdtrack). You can get the BDD100K dataset from its official website.