open-mmlab / mmtracking

OpenMMLab Video Perception Toolbox. It supports Video Object Detection (VID), Multiple Object Tracking (MOT), Single Object Tracking (SOT), and Video Instance Segmentation (VIS) within a unified framework.
https://mmtracking.readthedocs.io/en/latest/
Apache License 2.0

TypeError: forward_train() got an unexpected keyword argument 'ref_img_metas' #529

Open 376498485 opened 2 years ago

376498485 commented 2 years ago

Hi, when I use VFNet (VarifocalNet) from mmdetection as the detector for ByteTrack, I get the error from the title, TypeError: forward_train() got an unexpected keyword argument 'ref_img_metas' (traceback screenshot omitted). My config is as follows:

model = dict(
    detector=dict(
        type='VFNet',
        backbone=dict(
            type='ResNet',
            depth=101,
            num_stages=4,
            out_indices=(0, 1, 2, 3),
            frozen_stages=1,
            norm_cfg=dict(type='BN', requires_grad=True),
            norm_eval=True,
            style='pytorch',
            init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet101'),
            dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
            stage_with_dcn=(False, True, True, True)),
        neck=dict(
            type='FPN',
            in_channels=[256, 512, 1024, 2048],
            out_channels=256,
            start_level=1,
            add_extra_convs='on_output',
            num_outs=5,
            relu_before_extra_convs=True),
        bbox_head=dict(
            type='VFNetHead',
            num_classes=1,
            in_channels=256,
            stacked_convs=3,
            feat_channels=256,
            strides=[8, 16, 32, 64, 128],
            center_sampling=False,
            dcn_on_last_conv=True,
            use_atss=True,
            use_vfl=True,
            loss_cls=dict(
                type='VarifocalLoss',
                use_sigmoid=True,
                alpha=0.75,
                gamma=2.0,
                iou_weighted=True,
                loss_weight=1.0),
            loss_bbox=dict(type='GIoULoss', loss_weight=1.5),
            loss_bbox_refine=dict(type='GIoULoss', loss_weight=2.0)),
        init_cfg=dict(type='Pretrained', checkpoint='https://download.openmmlab.com/mmdetection/v2.0/vfnet/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco/vfnet_r101_fpn_mdconv_c3-c5_mstrain_2x_coco_20201027pth-7729adb5.pth'),
        train_cfg=dict(
            assigner=dict(type='ATSSAssigner', topk=9),
            allowed_border=-1,
            pos_weight=-1,
            debug=False),
        test_cfg=dict(
            nms_pre=1000,
            min_bbox_size=0,
            score_thr=0.05,
            nms=dict(type='nms', iou_threshold=0.6),
            max_per_img=100)
        ),

    type='ByteTrack',
    motion=dict(type='KalmanFilter'),
    tracker=dict(
        type='ByteTracker',
        obj_score_thrs=dict(high=0.6, low=0.1),
        init_track_thr=0.7,
        weight_iou_with_det_scores=True,
        match_iou_thrs=dict(high=0.1, low=0.5, tentative=0.3),
        num_frames_retain=30))
dataset_type = 'MOTChallengeDataset'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadMultiImagesFromFile', to_float32=True),
    dict(type='SeqLoadAnnotations', with_bbox=True, with_track=True),
    dict(
        type='SeqResize',
        img_scale=(1088, 1088),
        multiscale_mode='range',
        share_params=True,
        ratio_range=(0.8, 1.2),
        keep_ratio=True,
        bbox_clip_border=False),
    dict(type='SeqPhotoMetricDistortion', share_params=True),
    dict(
        type='SeqRandomCrop',
        share_params=False,
        crop_size=(1088, 1088),
        bbox_clip_border=False),
    dict(type='SeqRandomFlip', share_params=True, flip_ratio=0.5),
    dict(
        type='SeqNormalize',
        mean=[123.675, 116.28, 103.53],
        std=[58.395, 57.12, 57.375],
        to_rgb=True),
    dict(type='SeqPad', size_divisor=32),
    dict(type='MatchInstances', skip_nomatch=True),
    dict(
        type='VideoCollect',
        keys=[
            'img', 'gt_bboxes', 'gt_labels',
        ]),
    dict(type='SeqDefaultFormatBundle', ref_prefix='ref')
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1088, 1088),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='VideoCollect', keys=['img'])
        ])
]
data_root = 'data/MOT17/'
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='MOTChallengeDataset',
        visibility_thr=-1,
        ann_file='data/MOT17/annotations/half-train_cocoformat.json',
        img_prefix='data/MOT17/train',
        ref_img_sampler=dict(
            num_ref_imgs=1,
            frame_range=10,
            filter_key_img=True,
            method='uniform'),
        pipeline=train_pipeline),
    val=dict(
        type='MOTChallengeDataset',
        ann_file='data/MOT17/annotations/half-val_cocoformat.json',
        img_prefix='data/MOT17/train',
        ref_img_sampler=None,
        pipeline=test_pipeline),
    test=dict(
        type='MOTChallengeDataset',
        ann_file='data/MOT17/annotations/half-val_cocoformat.json',
        img_prefix='data/MOT17/train',
        ref_img_sampler=None,
        pipeline=test_pipeline))
evaluation = dict(metric=['bbox', 'track'], interval=1)
optimizer = dict(
    type='SGD',
    lr=0.01,
    momentum=0.9,
    weight_decay=0.0001,
    paramwise_cfg=dict(bias_lr_mult=2.0, bias_decay_mult=0.0))
optimizer_config = dict(grad_clip=None)
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.1,
    step=[16, 22])
total_epochs = 50
checkpoint_config = dict(interval=1)
log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')])
custom_hooks = [dict(type='NumClassCheckHook')]
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
opencv_num_threads = 0
mp_start_method = 'fork'
search_metrics = ['MOTA', 'IDF1', 'FN', 'FP', 'IDs', 'MT', 'ML']
work_dir = './work_dirs/bytetrack_vfnet'
gpu_ids = [0]

I would really appreciate it if you could help me.

dyhBUPT commented 2 years ago

Hi, do you use the latest version of mmdet?
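
For reference, a quick way to check the installed versions from Python (both packages expose the standard __version__ attribute):

# Print the installed mmdet and mmtrack versions so they can be compared
# against the version compatibility table in the mmtracking docs.
import mmdet
import mmtrack
print('mmdet:', mmdet.__version__)
print('mmtrack:', mmtrack.__version__)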

376498485 commented 2 years ago

> Hi, do you use the latest version of mmdet?

My mmdet version is 2.24. Maybe not all detectors in mmdet can be used with BYTE in mmtrack.

dyhBUPT commented 2 years ago

> Hi, do you use the latest version of mmdet?
>
> My mmdet version is 2.24. Maybe not all detectors in mmdet can be used with BYTE in mmtrack.

Have you tried other detectors? And do they work?

376498485 commented 2 years ago

> Hi, do you use the latest version of mmdet?
>
> My mmdet version is 2.24. Maybe not all detectors in mmdet can be used with BYTE in mmtrack.
>
> Have you tried other detectors? And do they work?

I have tried Sparse R-CNN, and it works. But Deformable DETR does not work. I only changed the detector in the config file, so I think R-CNN-like detectors in mmdet can be used with ByteTrack.

dyhBUPT commented 2 years ago

Maybe you should set "ref_img_sampler=None" in the "data" item.
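
A minimal sketch of that change applied to the train split of the config above. Note that with ref_img_sampler=None the dataset yields single images, so the Seq*-style train pipeline would also need to be replaced by single-image transforms; see the full detection-style config in the next comment:

# Assumed fix following the suggestion above: disable reference-image
# sampling so no ref_img/ref_img_metas keys reach the detector's
# forward_train. All other keys stay as posted.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='MOTChallengeDataset',
        visibility_thr=-1,
        ann_file='data/MOT17/annotations/half-train_cocoformat.json',
        img_prefix='data/MOT17/train',
        ref_img_sampler=None,  # was dict(num_ref_imgs=1, frame_range=10, ...)
        pipeline=train_pipeline))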

pixeli99 commented 2 years ago

Hi lijinrun, I think there is something wrong with your data configuration. Please use the following configuration instead, which is equivalent to mmtracking/configs/_base_/datasets/mot_challenge_det.py. When we train ByteTrack we don't need the ref_img, and sampling reference images is exactly what triggers the error you reported. With the configuration below, ByteTrack is compatible with any detector.

file_client_args = dict(backend='disk')  # required by the load transforms below
dataset_type = 'CocoDataset'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile', to_float32=True, file_client_args=file_client_args),
    dict(type='LoadAnnotations', with_bbox=True, file_client_args=file_client_args),
    dict(
        type='Resize',
        img_scale=(1088, 1088),
        ratio_range=(0.8, 1.2),
        keep_ratio=True,
        bbox_clip_border=False),
    dict(type='PhotoMetricDistortion'),
    dict(type='RandomCrop', crop_size=(1088, 1088), bbox_clip_border=False),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size_divisor=32),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
]
test_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=file_client_args),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1088, 1088),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=32),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data_root = 'data/MOT17/'
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        file_client_args=file_client_args,
        ann_file=data_root + 'annotations/half-train_cocoformat.json',
        img_prefix=data_root + 'train',
        classes=('pedestrian', ),
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        file_client_args=file_client_args,
        ann_file=data_root + 'annotations/half-val_cocoformat.json',
        img_prefix=data_root + 'train',
        classes=('pedestrian', ),
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        file_client_args=file_client_args,
        ann_file=data_root + 'annotations/half-val_cocoformat.json',
        img_prefix=data_root + 'train',
        classes=('pedestrian', ),
        pipeline=test_pipeline))
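
With that saved as the dataset part of the config, a hypothetical way to wire it in is the usual _base_ inheritance, the same pattern the official bytetrack configs use (paths assumed relative to configs/mot/bytetrack/):

# Hypothetical top of the user's config file: inherit the detection-style
# dataset settings (no reference-image sampling) and the default runtime,
# then keep the model = dict(type='ByteTrack', detector=dict(type='VFNet',
# ...)) block from the original post.
_base_ = [
    '../../_base_/datasets/mot_challenge_det.py',
    '../../_base_/default_runtime.py',
]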

pixeli99 commented 2 years ago

> Hi, do you use the latest version of mmdet?
>
> My mmdet version is 2.24. Maybe not all detectors in mmdet can be used with BYTE in mmtrack.
>
> Have you tried other detectors? And do they work?
>
> I have tried Sparse R-CNN, and it works. But Deformable DETR does not work. I only changed the detector in the config file, so I think R-CNN-like detectors in mmdet can be used with ByteTrack.

You said that it can run with Sparse R-CNN; that is because mmdet's Sparse R-CNN implementation accepts arbitrary extra keyword arguments in forward_train (screenshot of the signature omitted).
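
A minimal, self-contained sketch (toy functions, not the actual mmdet code) of why the extra keyword breaks one detector but not the other: two-stage detectors such as Sparse R-CNN declare **kwargs in forward_train, so the ref_img_metas key added by the tracking dataset is silently absorbed, while a fixed single-stage signature like VFNet's rejects it:

# Toy stand-ins for the two forward_train signatures (simplified, assumed).
def vfnet_like_forward_train(img, img_metas, gt_bboxes, gt_labels):
    # Fixed signature, in the spirit of mmdet's SingleStageDetector.
    return 'loss'

def sparse_rcnn_like_forward_train(img, img_metas, gt_bboxes, gt_labels,
                                   **kwargs):
    # Extra keywords such as ref_img_metas land in **kwargs and are ignored.
    return 'loss'

# What the tracking dataset hands to the detector when ref images are sampled:
batch = dict(img=None, img_metas=[], gt_bboxes=[], gt_labels=[],
             ref_img_metas=[])

print(sparse_rcnn_like_forward_train(**batch))  # prints 'loss'
print(vfnet_like_forward_train(**batch))
# TypeError: vfnet_like_forward_train() got an unexpected keyword
# argument 'ref_img_metas'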