zhengye1995 / underwater-object-detection

Kesci underwater object detection algorithm contest
203 stars 60 forks source link

使用多个GPU进行训练 #11

Closed zhangxiaohuixuhao closed 4 years ago

zhangxiaohuixuhao commented 4 years ago

您好,非常感谢您的贡献,在进行代码复现时,使用的环境是pytorch1.1.0,但是在用多个GPU进行训练测试时会出现以下错误: ./tools/dist_train.sh configs/underwater/cas_x101/cascade_rcnn_x101_64x4d_fpn_1x.py 4 2020-04-15 09:51:51,265 - INFO - Distributed training: True 2020-04-15 09:51:51,265 - INFO - MMDetection Version: 1.0rc1+3fe7433 2020-04-15 09:51:51,265 - INFO - Config: /DATA/ds/underwater-objection-detection/configs/underwater/cas_x101/cascade_rcnn_x101_64x4d_fpn_1x.py fp16 = dict(loss_scale=512.)

# model settings

model = dict( type='CascadeRCNN', num_stages=3, pretrained=None, backbone=dict( type='ResNeXt', depth=101, groups=64, base_width=4, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_scales=[8], anchor_ratios=[0.5, 1.0, 2.0], anchor_strides=[4, 8, 16, 32, 64], target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0], loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', out_size=7, sample_num=2), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=5, target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2], reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=5, target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1], reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='SharedFCBBoxHead', num_fcs=2, in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=5, target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067], reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ])

# model training and testing settings

train_cfg = dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=2000, max_num=2000, nms_thr=0.7, min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False) ], stage_loss_weights=[1, 0.5, 0.25]) test_cfg = dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_num=1000, nms_thr=0.7, min_bbox_size=0), rcnn=dict( score_thr=0.0001, nms=dict(type='soft_nms', iou_thr=0.5, min_score=0.0001), max_per_img=200))

# dataset settings

dataset_type = 'Underwater' data_root = '/DATA/ds/underwater-objection-detection/data/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True), dict(type='Resize', img_scale=[(4096, 600), (4096, 1000)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=[(4096, 600), (4096, 800), (4096, 1000)], flip=True, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( imgs_per_gpu=1, workers_per_gpu=2, train=dict( type=dataset_type, ann_file='data/train/annotations/train.json', img_prefix=data_root + 'train/image/', pipeline=train_pipeline), test=dict( type=dataset_type, ann_file='data/train/annotations/testA.json', img_prefix=data_root + 'test-A-image/', pipeline=test_pipeline))

# optimizer

optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))

# learning policy

lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, step=[8, 11]) checkpoint_config = dict(interval=12)

# yapf:disable

log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'),

# dict(type='TensorboardLoggerHook')

])

# yapf:enable

# runtime settings

total_epochs = 12 dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = './work_dirs/cas_x101_64x4d_fpn_htc_1x' load_from = 'data/pretrained/cascade_rcnn_x101_64x4d_fpn_1x_20181218-e2dc376a.pth' resume_from = None workflow = [('train', 1)]

loading annotations into memory... loading annotations into memory... Done (t=0.25s) creating index... index created! Done (t=0.30s) creating index... index created! loading annotations into memory... loading annotations into memory... Done (t=0.26s) creating index... index created! Done (t=0.29s) creating index... index created! Traceback (most recent call last): File "./tools/train.py", line 124, in main() File "./tools/train.py", line 120, in main timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 125, in train_detector timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 230, in _dist_train model = MMDistributedDataParallel(model.cuda()) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 288, in init self._ddp_init_helper() File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 306, in _ddp_init_helper self._module_copies = replicate(self.module, self.device_ids, detach=True) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 97, in replicate param_copies = _broadcast_coalesced_reshape(params, devices, detach) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 76, in _broadcast_coalesced_reshape return comm.broadcast_coalesced(tensors, devices) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/cuda/comm.py", line 39, in broadcast_coalesced return torch._C._broadcast_coalesced(tensors, devices, buffer_size) RuntimeError: all tensors must be on devices[0] Traceback (most recent call last): File "./tools/train.py", line 124, in main() File "./tools/train.py", line 120, in main timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 125, in train_detector timestamp=timestamp) File 
"/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 230, in _dist_train model = MMDistributedDataParallel(model.cuda()) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 288, in init self._ddp_init_helper() File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 306, in _ddp_init_helper self._module_copies = replicate(self.module, self.device_ids, detach=True) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 97, in replicate param_copies = _broadcast_coalesced_reshape(params, devices, detach) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 76, in _broadcast_coalesced_reshape return comm.broadcast_coalesced(tensors, devices) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/cuda/comm.py", line 39, in broadcast_coalesced return torch._C._broadcast_coalesced(tensors, devices, buffer_size) RuntimeError: all tensors must be on devices[0] Traceback (most recent call last): File "./tools/train.py", line 124, in main() File "./tools/train.py", line 120, in main timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 125, in train_detector timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 230, in _dist_train model = MMDistributedDataParallel(model.cuda()) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 288, in init self._ddp_init_helper() File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 306, in _ddp_init_helper self._module_copies = replicate(self.module, self.device_ids, detach=True) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 97, in replicate param_copies = _broadcast_coalesced_reshape(params, 
devices, detach) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/replicate.py", line 76, in _broadcast_coalesced_reshape return comm.broadcast_coalesced(tensors, devices) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/cuda/comm.py", line 39, in broadcast_coalesced return torch._C._broadcast_coalesced(tensors, devices, buffer_size) RuntimeError: all tensors must be on devices[0] 2020-04-15 09:52:24,125 - INFO - load checkpoint from data/pretrained/cascade_rcnn_x101_64x4d_fpn_1x_20181218-e2dc376a.pth 2020-04-15 09:52:24,522 - WARNING - The model and loaded state dict do not match exactly

size mismatch for bbox_head.0.fc_cls.weight: copying a param with shape torch.Size([81, 1024]) from checkpoint, the shape in current model is torch.Size([5, 1024]). size mismatch for bbox_head.0.fc_cls.bias: copying a param with shape torch.Size([81]) from checkpoint, the shape in current model is torch.Size([5]). size mismatch for bbox_head.1.fc_cls.weight: copying a param with shape torch.Size([81, 1024]) from checkpoint, the shape in current model is torch.Size([5, 1024]). size mismatch for bbox_head.1.fc_cls.bias: copying a param with shape torch.Size([81]) from checkpoint, the shape in current model is torch.Size([5]). size mismatch for bbox_head.2.fc_cls.weight: copying a param with shape torch.Size([81, 1024]) from checkpoint, the shape in current model is torch.Size([5, 1024]). size mismatch for bbox_head.2.fc_cls.bias: copying a param with shape torch.Size([81]) from checkpoint, the shape in current model is torch.Size([5]). 2020-04-15 09:52:24,526 - INFO - Start running, host: ds@elm01, work_dir: /DATA/ds/underwater-objection-detection/work_dirs/cas_x101_64x4d_fpn_htc_1x 2020-04-15 09:52:24,526 - INFO - workflow: [('train', 1)], max: 12 epochs Traceback (most recent call last): File "./tools/train.py", line 124, in main() File "./tools/train.py", line 120, in main timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 125, in train_detector timestamp=timestamp) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 272, in _dist_train runner.run(data_loaders, cfg.workflow, cfg.total_epochs) File "/DATA/ds/underwater-objection-detection/mmcv/mmcv/runner/runner.py", line 359, in run epoch_runner(data_loaders[i], kwargs) File "/DATA/ds/underwater-objection-detection/mmcv/mmcv/runner/runner.py", line 263, in train self.model, data_batch, train_mode=True, kwargs) File "/DATA/ds/underwater-objection-detection/mmdet/apis/train.py", line 100, in batch_processor losses = model(*data) File 
"/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/modules/module.py", line 493, in call result = self.forward(input, **kwargs) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 372, in forward self._sync_params() File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/nn/parallel/distributed.py", line 424, in _syncparams param.set(tensor) RuntimeError: Expected object of scalar type Float but got scalar type Half for argument #2 'source' Traceback (most recent call last): File "/home/ds/anaconda3/envs/py36/lib/python3.6/runpy.py", line 193, in _run_module_as_main "main", mod_spec) File "/home/ds/anaconda3/envs/py36/lib/python3.6/runpy.py", line 85, in _run_code exec(code, run_globals) File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 235, in main() File "/home/ds/anaconda3/envs/py36/lib/python3.6/site-packages/torch/distributed/launch.py", line 231, in main cmd=process.args) subprocess.CalledProcessError: Command '['/home/ds/anaconda3/envs/py36/bin/python', '-u', './tools/train.py', '--local_rank=0', 'configs/underwater/cas_x101/cascade_rcnn_x101_64x4d_fpn_1x.py', '--launcher', 'pytorch']' returned non-zero exit status 1. 期待您的回复

zhengye1995 commented 4 years ago

Try reinstalling the mmcv package at version 0.2.16, e.g.: pip install mmcv==0.2.16

zhangxiaohuixuhao commented 4 years ago

感谢大神的指导,是因为mmcv版本的问题,非常感谢!!!