open-mmlab / mmyolo

OpenMMLab YOLO series toolbox and benchmark. Implemented RTMDet, RTMDet-Rotated, YOLOv5, YOLOv6, YOLOv7, YOLOv8, YOLOX, PPYOLOE, etc.
https://mmyolo.readthedocs.io/zh_CN/dev/
GNU General Public License v3.0
2.97k stars 537 forks source link

mmdet.FocalLoss training error #337

Status: Closed (diplomatist closed this issue 1 year ago)

diplomatist commented 1 year ago

Prerequisite

🐞 Describe the bug

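我将 YOLOv7 head 的 loss_cls 和 loss_obj 的 type 修改为 'mmdet.FocalLoss' 后,训练时报错:mmcv/ops/focal_loss.py 中的 `assert input.dim() == 2` 触发了 AssertionError。从下面的 traceback 看,报错发生在 yolov7_head.py 的 `_calc_loss` 中传入 target_obj 计算损失的那一行。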

# config代码
default_scope = 'mmyolo'
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50),
    param_scheduler=dict(
        type='YOLOv5ParamSchedulerHook',
        scheduler_type='cosine',
        lr_factor=0.1,
        max_epochs=300),
    checkpoint=dict(
        type='CheckpointHook',
        interval=1,
        save_param_scheduler=False,
        save_best='coco/bbox_mAP_50',
        max_keep_ckpts=3),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='mmdet.DetVisualizationHook'))
env_cfg = dict(
    cudnn_benchmark=True,
    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
    dist_cfg=dict(backend='nccl'))
vis_backends = [dict(type='LocalVisBackend')]
visualizer = dict(
    type='mmdet.DetLocalVisualizer',
    vis_backends=[dict(type='LocalVisBackend')],
    name='visualizer')
log_processor = dict(type='LogProcessor', window_size=50, by_epoch=True)
log_level = 'INFO'
load_from = './work_dirs/yolov7_x_syncbn_fast_8x16b-300e_coco_20221124_215331-ef949a68.pth'
resume = False
file_client_args = dict(backend='disk')
data_root = 'data/banana/'
dataset_type = 'YOLOv5CocoDataset'
img_scale = (640, 640)
max_epochs = 300
save_epoch_intervals = 1
train_batch_size_per_gpu = 8
train_num_workers = 8
persistent_workers = True
val_batch_size_per_gpu = 1
val_num_workers = 2
mixup_p = 0.38
metainfo = dict(CLASSES=('0', ), PALETTE=[(255, 0, 0)])
batch_shapes_cfg = dict(
    type='BatchShapePolicy',
    batch_size=1,
    img_size=640,
    size_divisor=32,
    extra_pad_ratio=0.5)
anchors = [[(12, 16), (19, 36), (40, 28)], [(36, 75), (76, 55), (72, 146)],
           [(142, 110), (192, 243), (459, 401)]]
strides = [8, 16, 32]
num_det_layers = 3
num_classes = 1
model = dict(
    type='YOLODetector',
    data_preprocessor=dict(
        type='YOLOv5DetDataPreprocessor',
        mean=[0.0, 0.0, 0.0],
        std=[255.0, 255.0, 255.0],
        bgr_to_rgb=True),
    backbone=dict(
        type='YOLOv7Backbone',
        arch='X',
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='SiLU', inplace=True)),
    neck=dict(
        type='YOLOv7PAFPN',
        block_cfg=dict(
            type='ELANBlock',
            middle_ratio=0.4,
            block_ratio=0.4,
            num_blocks=3,
            num_convs_in_block=2),
        upsample_feats_cat_first=False,
        in_channels=[640, 1280, 1280],
        out_channels=[160, 320, 640],
        norm_cfg=dict(type='BN', momentum=0.03, eps=0.001),
        act_cfg=dict(type='SiLU', inplace=True),
        use_repconv_outs=False),
    bbox_head=dict(
        type='YOLOv7Head',
        head_module=dict(
            type='YOLOv7HeadModule',
            num_classes=1,
            in_channels=[320, 640, 1280],
            featmap_strides=[8, 16, 32],
            num_base_priors=3),
        prior_generator=dict(
            type='mmdet.YOLOAnchorGenerator',
            base_sizes=[[(12, 16), (19, 36), (40, 28)],
                        [(36, 75), (76, 55), (72, 146)],
                        [(142, 110), (192, 243), (459, 401)]],
            strides=[8, 16, 32]),
        loss_cls=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            reduction='mean',
            loss_weight=0.0037500000000000007),
        loss_bbox=dict(
            type='IoULoss',
            iou_mode='ciou',
            bbox_format='xywh',
            reduction='mean',
            loss_weight=0.05,
            return_iou=True),
        loss_obj=dict(
            type='mmdet.FocalLoss',
            use_sigmoid=True,
            reduction='mean',
            loss_weight=0.7),
        obj_level_weights=[4.0, 1.0, 0.4],
        prior_match_thr=4.0,
        simota_candidate_topk=10,
        simota_iou_weight=3.0,
        simota_cls_weight=1.0),
    test_cfg=dict(
        multi_label=True,
        nms_pre=30000,
        score_thr=0.001,
        nms=dict(type='nms', iou_threshold=0.65),
        max_per_img=300))
pre_transform = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(type='LoadAnnotations', with_bbox=True)
]
mosiac4_pipeline = [
    dict(
        type='Mosaic',
        img_scale=(640, 640),
        pad_val=114.0,
        pre_transform=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='LoadAnnotations', with_bbox=True)
        ]),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_translate_ratio=0.2,
        scaling_ratio_range=(0.1, 2.0),
        border=(-320, -320),
        border_val=(114, 114, 114))
]
mosiac9_pipeline = [
    dict(
        type='Mosaic9',
        img_scale=(640, 640),
        pad_val=114.0,
        pre_transform=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='LoadAnnotations', with_bbox=True)
        ]),
    dict(
        type='YOLOv5RandomAffine',
        max_rotate_degree=0.0,
        max_shear_degree=0.0,
        max_translate_ratio=0.2,
        scaling_ratio_range=(0.1, 2.0),
        border=(-320, -320),
        border_val=(114, 114, 114))
]
randchoice_mosaic_pipeline = dict(
    type='RandomChoice',
    transforms=[[{
        'type':
        'Mosaic',
        'img_scale': (640, 640),
        'pad_val':
        114.0,
        'pre_transform': [{
            'type': 'LoadImageFromFile',
            'file_client_args': {
                'backend': 'disk'
            }
        }, {
            'type': 'LoadAnnotations',
            'with_bbox': True
        }]
    }, {
        'type': 'YOLOv5RandomAffine',
        'max_rotate_degree': 0.0,
        'max_shear_degree': 0.0,
        'max_translate_ratio': 0.2,
        'scaling_ratio_range': (0.1, 2.0),
        'border': (-320, -320),
        'border_val': (114, 114, 114)
    }],
                [{
                    'type':
                    'Mosaic9',
                    'img_scale': (640, 640),
                    'pad_val':
                    114.0,
                    'pre_transform': [{
                        'type': 'LoadImageFromFile',
                        'file_client_args': {
                            'backend': 'disk'
                        }
                    }, {
                        'type': 'LoadAnnotations',
                        'with_bbox': True
                    }]
                }, {
                    'type': 'YOLOv5RandomAffine',
                    'max_rotate_degree': 0.0,
                    'max_shear_degree': 0.0,
                    'max_translate_ratio': 0.2,
                    'scaling_ratio_range': (0.1, 2.0),
                    'border': (-320, -320),
                    'border_val': (114, 114, 114)
                }]],
    prob=[0.8, 0.2])
train_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='RandomChoice',
        transforms=[[{
            'type':
            'Mosaic',
            'img_scale': (640, 640),
            'pad_val':
            114.0,
            'pre_transform': [{
                'type': 'LoadImageFromFile',
                'file_client_args': {
                    'backend': 'disk'
                }
            }, {
                'type': 'LoadAnnotations',
                'with_bbox': True
            }]
        }, {
            'type': 'YOLOv5RandomAffine',
            'max_rotate_degree': 0.0,
            'max_shear_degree': 0.0,
            'max_translate_ratio': 0.2,
            'scaling_ratio_range': (0.1, 2.0),
            'border': (-320, -320),
            'border_val': (114, 114, 114)
        }],
                    [{
                        'type':
                        'Mosaic9',
                        'img_scale': (640, 640),
                        'pad_val':
                        114.0,
                        'pre_transform': [{
                            'type': 'LoadImageFromFile',
                            'file_client_args': {
                                'backend': 'disk'
                            }
                        }, {
                            'type': 'LoadAnnotations',
                            'with_bbox': True
                        }]
                    }, {
                        'type': 'YOLOv5RandomAffine',
                        'max_rotate_degree': 0.0,
                        'max_shear_degree': 0.0,
                        'max_translate_ratio': 0.2,
                        'scaling_ratio_range': (0.1, 2.0),
                        'border': (-320, -320),
                        'border_val': (114, 114, 114)
                    }]],
        prob=[0.8, 0.2]),
    dict(
        type='YOLOv5MixUp',
        alpha=8.0,
        beta=8.0,
        prob=0.38,
        pre_transform=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(
                type='RandomChoice',
                transforms=[[{
                    'type':
                    'Mosaic',
                    'img_scale': (640, 640),
                    'pad_val':
                    114.0,
                    'pre_transform': [{
                        'type': 'LoadImageFromFile',
                        'file_client_args': {
                            'backend': 'disk'
                        }
                    }, {
                        'type': 'LoadAnnotations',
                        'with_bbox': True
                    }]
                }, {
                    'type': 'YOLOv5RandomAffine',
                    'max_rotate_degree': 0.0,
                    'max_shear_degree': 0.0,
                    'max_translate_ratio': 0.2,
                    'scaling_ratio_range': (0.1, 2.0),
                    'border': (-320, -320),
                    'border_val': (114, 114, 114)
                }],
                            [{
                                'type':
                                'Mosaic9',
                                'img_scale': (640, 640),
                                'pad_val':
                                114.0,
                                'pre_transform': [{
                                    'type': 'LoadImageFromFile',
                                    'file_client_args': {
                                        'backend': 'disk'
                                    }
                                }, {
                                    'type': 'LoadAnnotations',
                                    'with_bbox': True
                                }]
                            }, {
                                'type': 'YOLOv5RandomAffine',
                                'max_rotate_degree': 0.0,
                                'max_shear_degree': 0.0,
                                'max_translate_ratio': 0.2,
                                'scaling_ratio_range': (0.1, 2.0),
                                'border': (-320, -320),
                                'border_val': (114, 114, 114)
                            }]],
                prob=[0.8, 0.2])
        ]),
    dict(type='YOLOv5HSVRandomAug'),
    dict(type='mmdet.RandomFlip', prob=0.5),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                   'flip_direction'))
]
train_dataloader = dict(
    batch_size=8,
    num_workers=8,
    persistent_workers=True,
    pin_memory=True,
    sampler=dict(type='DefaultSampler', shuffle=True),
    collate_fn=dict(type='yolov5_collate'),
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/banana/',
        metainfo=dict(CLASSES=('0', ), PALETTE=[(255, 0, 0)]),
        ann_file='annotations/instances_train2017.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32),
        pipeline=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='LoadAnnotations', with_bbox=True),
            dict(
                type='RandomChoice',
                transforms=[[{
                    'type':
                    'Mosaic',
                    'img_scale': (640, 640),
                    'pad_val':
                    114.0,
                    'pre_transform': [{
                        'type': 'LoadImageFromFile',
                        'file_client_args': {
                            'backend': 'disk'
                        }
                    }, {
                        'type': 'LoadAnnotations',
                        'with_bbox': True
                    }]
                }, {
                    'type': 'YOLOv5RandomAffine',
                    'max_rotate_degree': 0.0,
                    'max_shear_degree': 0.0,
                    'max_translate_ratio': 0.2,
                    'scaling_ratio_range': (0.1, 2.0),
                    'border': (-320, -320),
                    'border_val': (114, 114, 114)
                }],
                            [{
                                'type':
                                'Mosaic9',
                                'img_scale': (640, 640),
                                'pad_val':
                                114.0,
                                'pre_transform': [{
                                    'type': 'LoadImageFromFile',
                                    'file_client_args': {
                                        'backend': 'disk'
                                    }
                                }, {
                                    'type': 'LoadAnnotations',
                                    'with_bbox': True
                                }]
                            }, {
                                'type': 'YOLOv5RandomAffine',
                                'max_rotate_degree': 0.0,
                                'max_shear_degree': 0.0,
                                'max_translate_ratio': 0.2,
                                'scaling_ratio_range': (0.1, 2.0),
                                'border': (-320, -320),
                                'border_val': (114, 114, 114)
                            }]],
                prob=[0.8, 0.2]),
            dict(
                type='YOLOv5MixUp',
                alpha=8.0,
                beta=8.0,
                prob=0.38,
                pre_transform=[
                    dict(
                        type='LoadImageFromFile',
                        file_client_args=dict(backend='disk')),
                    dict(type='LoadAnnotations', with_bbox=True),
                    dict(
                        type='RandomChoice',
                        transforms=[[{
                            'type':
                            'Mosaic',
                            'img_scale': (640, 640),
                            'pad_val':
                            114.0,
                            'pre_transform': [{
                                'type': 'LoadImageFromFile',
                                'file_client_args': {
                                    'backend': 'disk'
                                }
                            }, {
                                'type': 'LoadAnnotations',
                                'with_bbox': True
                            }]
                        }, {
                            'type': 'YOLOv5RandomAffine',
                            'max_rotate_degree': 0.0,
                            'max_shear_degree': 0.0,
                            'max_translate_ratio': 0.2,
                            'scaling_ratio_range': (0.1, 2.0),
                            'border': (-320, -320),
                            'border_val': (114, 114, 114)
                        }],
                                    [{
                                        'type':
                                        'Mosaic9',
                                        'img_scale': (640, 640),
                                        'pad_val':
                                        114.0,
                                        'pre_transform': [{
                                            'type': 'LoadImageFromFile',
                                            'file_client_args': {
                                                'backend': 'disk'
                                            }
                                        }, {
                                            'type': 'LoadAnnotations',
                                            'with_bbox': True
                                        }]
                                    }, {
                                        'type': 'YOLOv5RandomAffine',
                                        'max_rotate_degree': 0.0,
                                        'max_shear_degree': 0.0,
                                        'max_translate_ratio': 0.2,
                                        'scaling_ratio_range': (0.1, 2.0),
                                        'border': (-320, -320),
                                        'border_val': (114, 114, 114)
                                    }]],
                        prob=[0.8, 0.2])
                ]),
            dict(type='YOLOv5HSVRandomAug'),
            dict(type='mmdet.RandomFlip', prob=0.5),
            dict(
                type='mmdet.PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'flip', 'flip_direction'))
        ]))
test_pipeline = [
    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
    dict(type='YOLOv5KeepRatioResize', scale=(640, 640)),
    dict(
        type='LetterResize',
        scale=(640, 640),
        allow_scale_up=False,
        pad_val=dict(img=114)),
    dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
    dict(
        type='mmdet.PackDetInputs',
        meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param'))
]
val_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/banana/',
        metainfo=dict(CLASSES=('0', ), PALETTE=[(255, 0, 0)]),
        test_mode=True,
        data_prefix=dict(img='val2017/'),
        ann_file='annotations/instances_val2017.json',
        pipeline=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='YOLOv5KeepRatioResize', scale=(640, 640)),
            dict(
                type='LetterResize',
                scale=(640, 640),
                allow_scale_up=False,
                pad_val=dict(img=114)),
            dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
            dict(
                type='mmdet.PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'scale_factor', 'pad_param'))
        ],
        batch_shapes_cfg=dict(
            type='BatchShapePolicy',
            batch_size=1,
            img_size=640,
            size_divisor=32,
            extra_pad_ratio=0.5)))
test_dataloader = dict(
    batch_size=1,
    num_workers=2,
    persistent_workers=True,
    pin_memory=True,
    drop_last=False,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type='YOLOv5CocoDataset',
        data_root='data/banana/',
        metainfo=dict(CLASSES=('0', ), PALETTE=[(255, 0, 0)]),
        test_mode=True,
        data_prefix=dict(img='val2017/'),
        ann_file='annotations/instances_val2017.json',
        pipeline=[
            dict(
                type='LoadImageFromFile',
                file_client_args=dict(backend='disk')),
            dict(type='YOLOv5KeepRatioResize', scale=(640, 640)),
            dict(
                type='LetterResize',
                scale=(640, 640),
                allow_scale_up=False,
                pad_val=dict(img=114)),
            dict(type='LoadAnnotations', with_bbox=True, _scope_='mmdet'),
            dict(
                type='mmdet.PackDetInputs',
                meta_keys=('img_id', 'img_path', 'ori_shape', 'img_shape',
                           'scale_factor', 'pad_param'))
        ],
        batch_shapes_cfg=dict(
            type='BatchShapePolicy',
            batch_size=1,
            img_size=640,
            size_divisor=32,
            extra_pad_ratio=0.5)))
param_scheduler = None
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(
        type='SGD',
        lr=0.01,
        momentum=0.937,
        weight_decay=0.0005,
        nesterov=True,
        batch_size_per_gpu=8),
    constructor='YOLOv7OptimWrapperConstructor')
val_evaluator = dict(
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    ann_file='data/banana/annotations/instances_val2017.json',
    metric='bbox')
test_evaluator = dict(
    type='mmdet.CocoMetric',
    proposal_nums=(100, 1, 10),
    ann_file='data/banana/annotations/instances_val2017.json',
    metric='bbox')
train_cfg = dict(
    type='EpochBasedTrainLoop',
    max_epochs=300,
    val_interval=1,
    dynamic_intervals=[(270, 1)])
custom_hooks = [
    dict(
        type='EMAHook',
        ema_type='ExpMomentumEMA',
        momentum=0.0001,
        update_buffers=True,
        strict_load=False,
        priority=49)
]
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
evaluation = dict(interval=1)
work_dir = './work_dirs/yolov7_x_fl'
launcher = 'pytorch'
# shell
./tools/dist_train.sh configs/yolov7/yolov7_x_syncbn_fast_8x16b-300e_coco.py 2
# error message
Traceback (most recent call last):
  File "./tools/train.py", line 106, in <module>
    main()
  File "./tools/train.py", line 102, in main
    runner.train()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 1684, in train
    model = self.train_loop.run()  # type: ignore
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 90, in run
    self.run_epoch()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 106, in run_epoch
    self.run_iter(idx, data_batch)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 123, in run_iter
    data_batch, optim_wrapper=self.runner.optim_wrapper)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 121, in train_step
    losses = self._run_forward(data, mode='loss')
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 161, in _run_forward
    results = self(**data, mode=mode)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/base.py", line 92, in forward
    return self.loss(inputs, data_samples)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/single_stage.py", line 78, in loss
    losses = self.bbox_head.loss(x, batch_data_samples)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov5_head.py", line 450, in loss
    losses = self.loss_by_feat(*loss_inputs)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 278, in loss_by_feat
    device=device)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 353, in _calc_loss
    target_obj) * self.obj_level_weights[i]
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/focal_loss.py", line 240, in forward
    avg_factor=avg_factor)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/focal_loss.py", line 140, in sigmoid_focal_loss
    alpha, None, 'none')
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmcv/ops/focal_loss.py", line 30, in forward
    assert input.dim() == 2
AssertionError
Traceback (most recent call last):
  File "./tools/train.py", line 106, in <module>
    main()
  File "./tools/train.py", line 102, in main
    runner.train()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 1684, in train
    model = self.train_loop.run()  # type: ignore
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 90, in run
    self.run_epoch()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 106, in run_epoch
    self.run_iter(idx, data_batch)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 123, in run_iter
    data_batch, optim_wrapper=self.runner.optim_wrapper)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 121, in train_step
    losses = self._run_forward(data, mode='loss')
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 161, in _run_forward
    results = self(**data, mode=mode)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/base.py", line 92, in forward
    return self.loss(inputs, data_samples)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/single_stage.py", line 78, in loss
    losses = self.bbox_head.loss(x, batch_data_samples)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov5_head.py", line 450, in loss
    losses = self.loss_by_feat(*loss_inputs)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 278, in loss_by_feat
    device=device)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 353, in _calc_loss
    target_obj) * self.obj_level_weights[i]
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/focal_loss.py", line 240, in forward
    avg_factor=avg_factor)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/focal_loss.py", line 140, in sigmoid_focal_loss
    alpha, None, 'none')
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmcv/ops/focal_loss.py", line 30, in forward
    assert input.dim() == 2
AssertionError
terminate called after throwing an instance of 'c10::CUDAError'
  what():  CUDA error: driver shutting down
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Exception raised from query at ../aten/src/ATen/cuda/CUDAEvent.h:95 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f76e6fd1d62 in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x11a (0x7f77442a39ba in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libtorch_cuda_cpp.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x50 (0x7f77442a5cb0 in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libtorch_cuda_cpp.so)
frame #3: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x11c (0x7f77442a677c in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libtorch_cuda_cpp.so)
frame #4: <unknown function> + 0xbd6df (0x7f77af0706df in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #5: <unknown function> + 0x76db (0x7f77b66c86db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #6: clone + 0x3f (0x7f77b63f161f in /lib/x86_64-linux-gnu/libc.so.6)

ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 8563) of binary: /home/xux/anaconda3/envs/zzza_py36/bin/python
Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
    )(*cmd_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
./tools/train.py FAILED
------------------------------------------------------------
Failures:
[1]:
  time      : 2022-12-02_15:11:49
  host      : xux
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 8564)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2022-12-02_15:11:49
  host      : xux
  rank      : 0 (local_rank: 0)
  exitcode  : -6 (pid: 8563)
  error_file: <N/A>
  traceback : Signal 6 (SIGABRT) received by PID 8563
============================================================

Environment

#env message
/home/xux/anaconda3/envs/zzza_py36/bin/python3.6 /home/xux/CaiLiYuan/Project/mmyolo/mmyolo/utils/collect_env.py
sys.platform: linux
Python: 3.6.13 |Anaconda, Inc.| (default, Jun  4 2021, 14:25:59) [GCC 7.5.0]
CUDA available: True
numpy_random_seed: 2147483648
GPU 0,1: NVIDIA GeForce RTX 3090
CUDA_HOME: /usr/local/cuda
NVCC: Cuda compilation tools, release 11.3, V11.3.58
GCC: gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
PyTorch: 1.10.2+cu113
PyTorch compiling details: PyTorch built with:
  - GCC 7.3
  - C++ Version: 201402
  - Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191122 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)
  - OpenMP 201511 (a.k.a. OpenMP 4.5)
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: AVX512
  - CUDA Runtime 11.3
  - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86
  - CuDNN 8.2
  - Magma 2.5.2
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.2.0, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.2, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON,

TorchVision: 0.11.3+cu113
OpenCV: 4.5.5
MMEngine: 0.3.2
MMCV: 2.0.0rc3
MMDetection: 3.0.0rc4
MMYOLO: 0.2.0+27487fd

Process finished with exit code 0

Additional information

1. 我仅修改了 configs/yolov7/yolov7_l_syncbn_fast_8x16b-300e_coco.py 中 model->bbox_head->loss_cls、loss_obj->type='mmdet.FocalLoss',修改前是能正常训练的。 2. 我用的是自己的数据集,并将其转为 coco 格式,已能正常训练及验证。

hhaAndroid commented 1 year ago

@diplomatist Thank you for your feedback. I'll look into it.

diplomatist commented 1 year ago

When I use mmdet.DistributionFocalLoss or mmdet.GaussianFocalLoss, the error is:

# mmdet.DistributionFocalLoss or  mmdet.GaussianFocalLoss bug
2/05 23:06:20 - mmengine - INFO - Result has been saved to /home/xux/CaiLiYuan/Project/mmyolo/work_dirs/yolov7_x_Dfl/modules_statistic_results.json
Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
TypeError: __init__() got an unexpected keyword argument 'use_sigmoid'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 192, in __init__
    super().__init__(*args, **kwargs)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov5_head.py", line 185, in __init__
    self.loss_cls: nn.Module = MODELS.build(loss_cls)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/registry.py", line 454, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 240, in build_model_from_cfg
    return build_from_cfg(cfg, registry, default_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 136, in build_from_cfg
    f'class `{obj_cls.__name__}` in '  # type: ignore
TypeError: class `DistributionFocalLoss` in mmdet/models/losses/gfocal_loss.py: __init__() got an unexpected keyword argument 'use_sigmoid'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/detectors/yolo_detector.py", line 48, in __init__
    init_cfg=init_cfg)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/single_stage.py", line 35, in __init__
    self.bbox_head = MODELS.build(bbox_head)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/registry.py", line 454, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 240, in build_model_from_cfg
    return build_from_cfg(cfg, registry, default_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 136, in build_from_cfg
    f'class `{obj_cls.__name__}` in '  # type: ignore
TypeError: class `YOLOv7Head` in mmyolo/models/dense_heads/yolov7_head.py: class `DistributionFocalLoss` in mmdet/models/losses/gfocal_loss.py: __init__() got an unexpected keyword argument 'use_sigmoid'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "./tools/train.py", line 106, in <module>
    main()
  File "./tools/train.py", line 95, in main
    runner = Runner.from_cfg(cfg)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 464, in from_cfg
    cfg=cfg,
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 404, in __init__
    self.model = self.build_model(model)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 806, in build_model
    model = MODELS.build(model)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/registry.py", line 454, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 240, in build_model_from_cfg
    return build_from_cfg(cfg, registry, default_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 136, in build_from_cfg
    f'class `{obj_cls.__name__}` in '  # type: ignore
TypeError: class `YOLODetector` in mmyolo/models/detectors/yolo_detector.py: class `YOLOv7Head` in mmyolo/models/dense_heads/yolov7_head.py: class `DistributionFocalLoss` in mmdet/models/losses/gfocal_loss.py: __init__() got an unexpected keyword argument 'use_sigmoid'
Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
TypeError: __init__() got an unexpected keyword argument 'use_sigmoid'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 192, in __init__
    super().__init__(*args, **kwargs)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov5_head.py", line 185, in __init__
    self.loss_cls: nn.Module = MODELS.build(loss_cls)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/registry.py", line 454, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 240, in build_model_from_cfg
    return build_from_cfg(cfg, registry, default_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 136, in build_from_cfg
    f'class `{obj_cls.__name__}` in '  # type: ignore
TypeError: class `DistributionFocalLoss` in mmdet/models/losses/gfocal_loss.py: __init__() got an unexpected keyword argument 'use_sigmoid'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
    obj = obj_cls(**args)  # type: ignore
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/detectors/yolo_detector.py", line 48, in __init__
    init_cfg=init_cfg)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/single_stage.py", line 35, in __init__
    self.bbox_head = MODELS.build(bbox_head)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/registry.py", line 454, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 240, in build_model_from_cfg
    return build_from_cfg(cfg, registry, default_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 136, in build_from_cfg
    f'class `{obj_cls.__name__}` in '  # type: ignore
TypeError: class `YOLOv7Head` in mmyolo/models/dense_heads/yolov7_head.py: class `DistributionFocalLoss` in mmdet/models/losses/gfocal_loss.py: __init__() got an unexpected keyword argument 'use_sigmoid'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "./tools/train.py", line 106, in <module>
    main()
  File "./tools/train.py", line 95, in main
    runner = Runner.from_cfg(cfg)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 464, in from_cfg
    cfg=cfg,
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 404, in __init__
    self.model = self.build_model(model)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 806, in build_model
    model = MODELS.build(model)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/registry.py", line 454, in build
    return self.build_func(cfg, *args, **kwargs, registry=self)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 240, in build_model_from_cfg
    return build_from_cfg(cfg, registry, default_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/registry/build_functions.py", line 136, in build_from_cfg
    f'class `{obj_cls.__name__}` in '  # type: ignore
TypeError: class `YOLODetector` in mmyolo/models/detectors/yolo_detector.py: class `YOLOv7Head` in mmyolo/models/dense_heads/yolov7_head.py: class `DistributionFocalLoss` in mmdet/models/losses/gfocal_loss.py: __init__() got an unexpected keyword argument 'use_sigmoid'
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 24137) of binary: /home/xux/anaconda3/envs/zzza_py36/bin/python
Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
    )(*cmd_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
./tools/train.py FAILED
------------------------------------------------------------

When I use mmdet.QualityFocalLoss, the error is:

# mmdet.QualityFocalLoss bug
Traceback (most recent call last):
  File "./tools/train.py", line 106, in <module>
    main()
  File "./tools/train.py", line 102, in main
    runner.train()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 1684, in train
Traceback (most recent call last):
  File "./tools/train.py", line 106, in <module>
    main()
  File "./tools/train.py", line 102, in main
    runner.train()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/runner.py", line 1684, in train
    model = self.train_loop.run()  # type: ignore
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 90, in run
    self.run_epoch()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 106, in run_epoch
    model = self.train_loop.run()  # type: ignore
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 90, in run
    self.run_epoch()
    self.run_iter(idx, data_batch)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 106, in run_epoch
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 123, in run_iter
    self.run_iter(idx, data_batch)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/runner/loops.py", line 123, in run_iter
    data_batch, optim_wrapper=self.runner.optim_wrapper)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 121, in train_step
    losses = self._run_forward(data, mode='loss')
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 161, in _run_forward
    results = self(**data, mode=mode)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    data_batch, optim_wrapper=self.runner.optim_wrapper)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 121, in train_step
    losses = self._run_forward(data, mode='loss')
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmengine/model/wrappers/distributed.py", line 161, in _run_forward
    results = self(**data, mode=mode)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    return forward_call(*input, **kwargs)
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 886, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/base.py", line 92, in forward
    return self.loss(inputs, data_samples)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/single_stage.py", line 78, in loss
    losses = self.bbox_head.loss(x, batch_data_samples)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov5_head.py", line 450, in loss
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/base.py", line 92, in forward
    return self.loss(inputs, data_samples)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/detectors/single_stage.py", line 78, in loss
    losses = self.loss_by_feat(*loss_inputs)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 278, in loss_by_feat
    losses = self.bbox_head.loss(x, batch_data_samples)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov5_head.py", line 450, in loss
    device=device)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 353, in _calc_loss
    losses = self.loss_by_feat(*loss_inputs)
    target_obj) * self.obj_level_weights[i]
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 278, in loss_by_feat
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    device=device)
  File "/home/xux/CaiLiYuan/Project/mmyolo/mmyolo/models/dense_heads/yolov7_head.py", line 353, in _calc_loss
    target_obj) * self.obj_level_weights[i]
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/gfocal_loss.py", line 193, in forward
    avg_factor=avg_factor)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/utils.py", line 99, in wrapper
    loss = loss_func(pred, target, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/gfocal_loss.py", line 28, in quality_focal_loss
    including category label and quality label, respectively"""
    return forward_call(*input, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/gfocal_loss.py", line 193, in forward
AssertionError: target for QFL must be a tuple of two elements,
        including category label and quality label, respectively
    avg_factor=avg_factor)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/utils.py", line 99, in wrapper
    loss = loss_func(pred, target, **kwargs)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/mmdet/models/losses/gfocal_loss.py", line 28, in quality_focal_loss
    including category label and quality label, respectively"""
AssertionError: target for QFL must be a tuple of two elements,
        including category label and quality label, respectively
terminate called after throwing an instance of 'c10::CUDAError'
  what():  CUDA error: driver shutting down
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Exception raised from query at ../aten/src/ATen/cuda/CUDAEvent.h:95 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7f1524215d62 in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: c10d::ProcessGroupNCCL::WorkNCCL::finishedGPUExecutionInternal() const + 0x11a (0x7f15814e79ba in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libtorch_cuda_cpp.so)
frame #2: c10d::ProcessGroupNCCL::WorkNCCL::isCompleted() + 0x50 (0x7f15814e9cb0 in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libtorch_cuda_cpp.so)
frame #3: c10d::ProcessGroupNCCL::workCleanupLoop() + 0x11c (0x7f15814ea77c in /home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/lib/libtorch_cuda_cpp.so)
frame #4: <unknown function> + 0xbd6df (0x7f15ec2b46df in /usr/lib/x86_64-linux-gnu/libstdc++.so.6)
frame #5: <unknown function> + 0x76db (0x7f15f390c6db in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #6: clone + 0x3f (0x7f15f363561f in /lib/x86_64-linux-gnu/libc.so.6)

ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 24553) of binary: /home/xux/anaconda3/envs/zzza_py36/bin/python
Traceback (most recent call last):
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/run.py", line 713, in run
    )(*cmd_args)
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/xux/anaconda3/envs/zzza_py36/lib/python3.6/site-packages/torch/distributed/launcher/api.py", line 261, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
./tools/train.py FAILED
------------------------------------------------------------

@hhaAndroid

hhaAndroid commented 1 year ago

@diplomatist We're working on this; please wait a moment.