open-mmlab / mmdetection

OpenMMLab Detection Toolbox and Benchmark
https://mmdetection.readthedocs.io
Apache License 2.0
29.25k stars 9.41k forks source link

AssertionError: loss log variables are different across GPUs! I used cascade Rcnn to train object365,and I have processed the dataset. However,the loss always turn to NAN and it can not run the first epoch successfully. #9291

Open imzhaoruijie opened 1 year ago

imzhaoruijie commented 1 year ago

Prerequisite

Task

I have modified the scripts/configs, or I'm working on my own tasks/models/datasets.

Branch

master branch https://github.com/open-mmlab/mmdetection

Environment

TorchVision: 0.9.1 OpenCV: 4.6.0 MMCV: 1.5.3 MMCV Compiler: GCC 7.5 MMCV CUDA Compiler: 11.1 MMDetection: 2.25.0+unknown

Reproduces the problem - code sample

If the loss_vars has different length, GPUs will wait infinitely

    if dist.is_available() and dist.is_initialized():
        log_var_length = torch.tensor(len(log_vars), device=loss.device)
        dist.all_reduce(log_var_length)
        message = (f'rank {dist.get_rank()}' +
                   f' len(log_vars): {len(log_vars)}' + ' keys: ' +
                   ','.join(log_vars.keys()))
        # print("log_var_length:",log_var_length) #tensor(66, device='cuda:5') 
        # print("len(log_vars):",len(log_vars)) #11
        # print("log_vars")
        # print(log_vars)
        # print("dist.get_world_size():",dist.get_world_size()) #6
        if len(log_vars)!=11:
            print("log_vars:",log_vars)
            log_vars["loss_rpn_cls"]=torch.tensor(0.,device=loss.device)
            log_vars["loss_rpn_bbox"]=torch.tensor(0., device=loss.device)
            log_vars["s0.loss_cls"]=torch.tensor(0., device=loss.device)
            log_vars["s0.acc"]=torch.tensor(0., device=loss.device)
            log_vars["s0.loss_bbox"]=torch.tensor(0., device=loss.device)
            log_vars["s1.loss_cls"]=torch.tensor(0., device=loss.device)
            log_vars["s1.acc"]=torch.tensor(0., device=loss.device)
            log_vars["s1.loss_bbox"]=torch.tensor(0., device=loss.device)
            log_vars["s2.loss_cls"]=torch.tensor(0., device=loss.device)
            log_vars["s2.acc"]=torch.tensor(0., device=loss.device)
            log_vars["s2.loss_bbox"]=torch.tensor(0., device=loss.device)

        assert log_var_length == len(log_vars) * dist.get_world_size(), \
            'loss log variables are different across GPUs!\n' + message

Reproduces the problem - command or script

pretrainedpath = '/mnt/home/mmdetection/swin_large_patch4_window7_224_22k.pth' model = dict( type='CascadeRCNN', backbone=dict( type='SwinTransformer', embed_dims=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7, mlp_ratio=4, qkv_bias=True, qk_scale=None, drop_rate=0.0, attn_drop_rate=0.0, drop_path_rate=0.2, patch_norm=True, out_indices=(0, 1, 2, 3), with_cp=False, convert_weights=True, init_cfg=dict( type='Pretrained', checkpoint= '/mnt/home/mmdetection/swin_large_patch4_window7_224_22k.pth')), neck=dict( type='FPN', in_channels=[192, 384, 768, 1536], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=0.1111111111111111, loss_weight=1.0)), roi_head=dict( type='CascadeRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=365, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=365, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=365, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0.0, 0.0, 0.0, 0.0], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ]), train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=2000, max_per_img=2000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_pre=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))) dataset_type = 'CocoDataset' data_root = '/mnt/home/dataset/object365/' classes = ( 'human', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', 'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'monitor', 'storage box', 'plants pot/vase', 'bench', 'wine glass', 'boots', 'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can', 'stool', 'backpack', 'sofa', 'belt', 'carpet', 'basket', 'towel/napkin', 'slippers', 'bowl', 'barrel/bucket', 'coffee table', 'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'combine with bowl', 'sink', 'horse', 'fish', 'apple', 'traffic sign', 'camera', 'candle', 'stuffed animal', 'cake', 'motorbike/motorcycle', 'wild bird', 'laptop', 'knife', 'cellphone', 'paddle', 'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey stick', 'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote', 'combine with glove', 'paper towel', 'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone', 'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun', 'Life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car', 'giraffe', 'pumpkin', 'Accordion/keyboard/piano', 'radiator', 'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', 'cutting/chopping board', 'tennis racket', 'candy', 'skating and skiing shoes', 'scissors', 'folder', 'baseball', 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', 'american football', 'basketball', 'potato', 'paint brush', 'printer', 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', 'chopsticks', 'Electronic stove and gas stove', 'pie', 'frisbee', 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', 'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar', 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', 'key', 'screwdriver', 'globe', 'broom', 'pliers', 'hammer', 'volleyball', 'eggplant', 'trophy', 'board eraser', 'dates', 'rice', 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly', 'egg tart', 'cheese', 'pomelo', 'pig', 'race car', 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair dryer', 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', 'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish', 'noodles', 'mop', 'yak', 'crab', 'microscope', 'barbell', 'Bread/bun', 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'mangosteen', 'seal', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle', 'okra', 'starfish', 'monkey', 'eagle', 'durian', 'rabbit', 'game board', 'french horn', 'ambulance', 'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron', 'flashlight') img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type='CocoDataset', classes=( 'human', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', 'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'monitor', 'storage box', 'plants pot/vase', 'bench', 'wine glass', 'boots', 'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can', 'stool', 'backpack', 'sofa', 'belt', 'carpet', 'basket', 'towel/napkin', 'slippers', 'bowl', 'barrel/bucket', 'coffee table', 'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'combine with bowl', 'sink', 'horse', 'fish', 'apple', 'traffic sign', 'camera', 'candle', 'stuffed animal', 'cake', 'motorbike/motorcycle', 'wild bird', 'laptop', 'knife', 'cellphone', 'paddle', 'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey stick', 'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote', 'combine with glove', 'paper towel', 'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone', 'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun', 'Life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car', 'giraffe', 'pumpkin', 'Accordion/keyboard/piano', 'radiator', 'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', 'cutting/chopping board', 'tennis racket', 'candy', 'skating and skiing shoes', 'scissors', 'folder', 'baseball', 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', 'american football', 'basketball', 'potato', 'paint brush', 'printer', 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', 'chopsticks', 'Electronic stove and gas stove', 'pie', 'frisbee', 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', 'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar', 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', 'key', 'screwdriver', 'globe', 'broom', 'pliers', 'hammer', 'volleyball', 'eggplant', 'trophy', 'board eraser', 'dates', 'rice', 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly', 'egg tart', 'cheese', 'pomelo', 'pig', 'race car', 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair dryer', 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', 'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish', 'noodles', 'mop', 'yak', 'crab', 'microscope', 'barbell', 'Bread/bun', 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'mangosteen', 'seal', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle', 'okra', 'starfish', 'monkey', 'eagle', 'durian', 'rabbit', 'game board', 'french horn', 'ambulance', 'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron', 'flashlight'), ann_file= '/mnt/home/dataset/object365/annotations/zhiyuan_objv2_train.json', img_prefix='/mnt/home/dataset/object365/train/', filter_empty_gt=True, pipeline=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']) ]), val=dict( type='CocoDataset', classes=( 'human', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', 'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'monitor', 'storage box', 'plants pot/vase', 'bench', 'wine glass', 'boots', 'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can', 'stool', 'backpack', 'sofa', 'belt', 'carpet', 'basket', 'towel/napkin', 'slippers', 'bowl', 'barrel/bucket', 'coffee table', 'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'combine with bowl', 'sink', 'horse', 'fish', 'apple', 'traffic sign', 'camera', 'candle', 'stuffed animal', 'cake', 'motorbike/motorcycle', 'wild bird', 'laptop', 'knife', 'cellphone', 'paddle', 'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey stick', 'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote', 'combine with glove', 'paper towel', 'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone', 'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun', 'Life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car', 'giraffe', 'pumpkin', 'Accordion/keyboard/piano', 'radiator', 'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', 'cutting/chopping board', 'tennis racket', 'candy', 'skating and skiing shoes', 'scissors', 'folder', 'baseball', 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', 'american football', 'basketball', 'potato', 'paint brush', 'printer', 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', 'chopsticks', 'Electronic stove and gas stove', 'pie', 'frisbee', 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', 'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar', 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', 'key', 'screwdriver', 'globe', 'broom', 'pliers', 'hammer', 'volleyball', 'eggplant', 'trophy', 'board eraser', 'dates', 'rice', 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly', 'egg tart', 'cheese', 'pomelo', 'pig', 'race car', 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair dryer', 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', 'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish', 'noodles', 'mop', 'yak', 'crab', 'microscope', 'barbell', 'Bread/bun', 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'mangosteen', 'seal', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle', 'okra', 'starfish', 'monkey', 'eagle', 'durian', 'rabbit', 'game board', 'french horn', 'ambulance', 'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron', 'flashlight'), ann_file= '/mnt/home/dataset/object365/annotations/zhiyuan_objv2_val.json', img_prefix='/mnt/home/dataset/object365/val/', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img']) ]) ]), test=dict( type='CocoDataset', classes=( 'human', 'sneakers', 'chair', 'hat', 'lamp', 'bottle', 'cabinet/shelf', 'cup', 'car', 'glasses', 'picture/frame', 'desk', 'handbag', 'street lights', 'book', 'plate', 'helmet', 'leather shoes', 'pillow', 'glove', 'potted plant', 'bracelet', 'flower', 'monitor', 'storage box', 'plants pot/vase', 'bench', 'wine glass', 'boots', 'dining table', 'umbrella', 'boat', 'flag', 'speaker', 'trash bin/can', 'stool', 'backpack', 'sofa', 'belt', 'carpet', 'basket', 'towel/napkin', 'slippers', 'bowl', 'barrel/bucket', 'coffee table', 'suv', 'toy', 'tie', 'bed', 'traffic light', 'pen/pencil', 'microphone', 'sandals', 'canned', 'necklace', 'mirror', 'faucet', 'bicycle', 'bread', 'high heels', 'ring', 'van', 'watch', 'combine with bowl', 'sink', 'horse', 'fish', 'apple', 'traffic sign', 'camera', 'candle', 'stuffed animal', 'cake', 'motorbike/motorcycle', 'wild bird', 'laptop', 'knife', 'cellphone', 'paddle', 'truck', 'cow', 'power outlet', 'clock', 'drum', 'fork', 'bus', 'hanger', 'nightstand', 'pot/pan', 'sheep', 'guitar', 'traffic cone', 'tea pot', 'keyboard', 'tripod', 'hockey stick', 'fan', 'dog', 'spoon', 'blackboard/whiteboard', 'balloon', 'air conditioner', 'cymbal', 'mouse', 'telephone', 'pickup truck', 'orange', 'banana', 'airplane', 'luggage', 'skis', 'soccer', 'trolley', 'oven', 'remote', 'combine with glove', 'paper towel', 'refrigerator', 'train', 'tomato', 'machinery vehicle', 'tent', 'shampoo/shower gel', 'head phone', 'lantern', 'donut', 'cleaning products', 'sailboat', 'tangerine', 'pizza', 'kite', 'computer box', 'elephant', 'toiletries', 'gas stove', 'broccoli', 'toilet', 'stroller', 'shovel', 'baseball bat', 'microwave', 'skateboard', 'surfboard', 'surveillance camera', 'gun', 'Life saver', 'cat', 'lemon', 'liquid soap', 'zebra', 'duck', 'sports car', 'giraffe', 'pumpkin', 'Accordion/keyboard/piano', 'radiator', 'converter', 'tissue ', 'carrot', 'washing machine', 'vent', 'cookies', 'cutting/chopping board', 'tennis racket', 'candy', 'skating and skiing shoes', 'scissors', 'folder', 'baseball', 'strawberry', 'bow tie', 'pigeon', 'pepper', 'coffee machine', 'bathtub', 'snowboard', 'suitcase', 'grapes', 'ladder', 'pear', 'american football', 'basketball', 'potato', 'paint brush', 'printer', 'billiards', 'fire hydrant', 'goose', 'projector', 'sausage', 'fire extinguisher', 'extension cord', 'facial mask', 'tennis ball', 'chopsticks', 'Electronic stove and gas stove', 'pie', 'frisbee', 'kettle', 'hamburger', 'golf club', 'cucumber', 'clutch', 'blender', 'tong', 'slide', 'hot dog', 'toothbrush', 'facial cleanser', 'mango', 'deer', 'egg', 'violin', 'marker', 'ship', 'chicken', 'onion', 'ice cream', 'tape', 'wheelchair', 'plum', 'bar soap', 'scale', 'watermelon', 'cabbage', 'router/modem', 'golf ball', 'pine apple', 'crane', 'fire truck', 'peach', 'cello', 'notepaper', 'tricycle', 'toaster', 'helicopter', 'green beans', 'brush', 'carriage', 'cigar', 'earphone', 'penguin', 'hurdle', 'swing', 'radio', 'CD', 'parking meter', 'swan', 'garlic', 'french fries', 'horn', 'avocado', 'saxophone', 'trumpet', 'sandwich', 'cue', 'kiwi fruit', 'bear', 'fishing rod', 'cherry', 'tablet', 'green vegetables', 'nuts', 'corn', 'key', 'screwdriver', 'globe', 'broom', 'pliers', 'hammer', 'volleyball', 'eggplant', 'trophy', 'board eraser', 'dates', 'rice', 'tape measure/ruler', 'dumbbell', 'hamimelon', 'stapler', 'camel', 'lettuce', 'goldfish', 'meat balls', 'medal', 'toothpaste', 'antelope', 'shrimp', 'rickshaw', 'trombone', 'pomegranate', 'coconut', 'jellyfish', 'mushroom', 'calculator', 'treadmill', 'butterfly', 'egg tart', 'cheese', 'pomelo', 'pig', 'race car', 'rice cooker', 'tuba', 'crosswalk sign', 'papaya', 'hair dryer', 'green onion', 'chips', 'dolphin', 'sushi', 'urinal', 'donkey', 'electric drill', 'spring rolls', 'tortoise/turtle', 'parrot', 'flute', 'measuring cup', 'shark', 'steak', 'poker card', 'binoculars', 'llama', 'radish', 'noodles', 'mop', 'yak', 'crab', 'microscope', 'barbell', 'Bread/bun', 'baozi', 'lion', 'red cabbage', 'polar bear', 'lighter', 'mangosteen', 'seal', 'comb', 'eraser', 'pitaya', 'scallop', 'pencil case', 'saw', 'table tennis paddle', 'okra', 'starfish', 'monkey', 'eagle', 'durian', 'rabbit', 'game board', 'french horn', 'ambulance', 'asparagus', 'hoverboard', 'pasta', 'target', 'hotair balloon', 'chainsaw', 'lobster', 'iron', 'flashlight'), ann_file= '/mnt/home/dataset/object365/annotations/zhiyuan_objv2_val.json', img_prefix='/mnt/home/dataset/object365/val/', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img']) ]) ])) evaluation = dict(interval=1, metric='bbox') optimizer = dict( type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05, paramwise_cfg=dict( custom_keys=dict( absolute_pos_embed=dict(decay_mult=0.0), relative_position_bias_table=dict(decay_mult=0.0), norm=dict(decay_mult=0.0)))) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[27, 33]) runner = dict(type='EpochBasedRunner', max_epochs=50) checkpoint_config = dict(interval=1) log_config = dict(interval=50, hooks=[dict(type='TextLoggerHook')]) custom_hooks = [dict(type='NumClassCheckHook')] dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] opencv_num_threads = 0 mp_start_method = 'fork' auto_scale_lr = dict(enable=True, base_batch_size=12) work_dir = './work_dirs/cascade_swinl-obj365' auto_resume = False gpu_ids = range(0, 6)

Reproduces the problem - error message

AssertionError: loss log variables are different across GPUs! rank 1 len(log_vars): 3 keys: loss_rpn_cls,loss_rpn_bbox,s0.loss_bbox

AssertionError: loss log variables are different across GPUs! rank 0 len(log_vars): 11 keys: loss_rpn_cls,loss_rpn_bbox,s0.loss_cls,s0.acc,s0.loss_bbox,s1.loss_cls,s1.acc,s1.loss_bbox,s2.loss_cls,s2.acc,s2.loss_bbox

File "/mnt/home/mmdetection/mmdet/models/detectors/base.py", line 249, in train_step loss, log_vars = self._parse_losses(losses) File "/mnt/home/mmdetection/mmdet/models/detectors/base.py", line 208, in _parse_losses assert log_var_length == len(log_vars) * dist.get_world_size(), \

Additional information

for the error in dataset, I have tried to filter the images with errors, and the codes are as follows:(in mmdet/datasets/coco.py, I add the function of is_out_of_frame) def _parse_ann_info(self, img_info, ann_info): gt_bboxes = [] gt_labels = [] gt_bboxes_ignore = [] gt_masks_ann = []

    is_out_of_frame = lambda x, y: (
        x < 0 or x >= img_info['width'] or 
        y < 0 or y >= img_info['height']
    )

    for i, ann in enumerate(ann_info):
        if ann.get('ignore', False):
            continue
        x1, y1, w, h = ann['bbox']
        inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0))
        inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0))
        if inter_w * inter_h == 0:
            continue

        if is_out_of_frame(x1, y1) and is_out_of_frame(x1+w, y1+h):
            continue 
RangiLyu commented 1 year ago
AssertionError: loss log variables are different across GPUs!
rank 1 len(log_vars): 3 keys: loss_rpn_cls,loss_rpn_bbox,s0.loss_bbox

AssertionError: loss log variables are different across GPUs!
rank 0 len(log_vars): 11 keys: loss_rpn_cls,loss_rpn_bbox,s0.loss_cls,s0.acc,s0.loss_bbox,s1.loss_cls,s1.acc,s1.loss_bbox,s2.loss_cls,s2.acc,s2.loss_bbox

It seems that only the RPN is trained on rank 1, the cascade roi head has no loss. I'm not sure what caused this. You may need to debug and see why the roi head is not trained on some gpus.