Alexandre-Delplanque opened this issue 4 years ago
I tried with the command: bash tools/dist_train.sh {CONFIG_FILE} 1
and the original error disappeared.
However, another RuntimeError appears:
Traceback (most recent call last):
  File "tools/train.py", line 142, in <module>
    main()
  File "tools/train.py", line 138, in main
    meta=meta)
  File "/content/mmdetection/mmdet/apis/train.py", line 102, in train_detector
    meta=meta)
  File "/content/mmdetection/mmdet/apis/train.py", line 251, in _dist_train
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/epoch_based_runner.py", line 122, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/epoch_based_runner.py", line 35, in train
    self.model, data_batch, train_mode=True, **kwargs)
  File "/content/mmdetection/mmdet/apis/train.py", line 75, in batch_processor
    losses = model(**data)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/distributed.py", line 447, in forward
    output = self.module(*inputs[0], **kwargs[0])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection/mmdet/core/fp16/decorators.py", line 49, in new_func
    return old_func(*args, **kwargs)
  File "/content/mmdetection/mmdet/models/detectors/base.py", line 137, in forward
    return self.forward_train(img, img_meta, **kwargs)
  File "/content/mmdetection/mmdet/models/detectors/single_stage.py", line 67, in forward_train
    x = self.extract_feat(img)
  File "/content/mmdetection/mmdet/models/detectors/single_stage.py", line 47, in extract_feat
    x = self.backbone(img)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection/mmdet/models/backbones/spinenet.py", line 259, in forward
    target_feat = self.merge_ops[i]([block_feats[feat_idx] for feat_idx in spec.input_offsets])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection/mmdet/models/backbones/spinenet.py", line 146, in forward
    target_feat = parent0_feat + parent1_feat
RuntimeError: The size of tensor a (125) must match the size of tensor b (126) at non-singleton dimension 3
Traceback (most recent call last):
  File "/usr/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 263, in <module>
    main()
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/launch.py", line 259, in main
    cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', '-u', 'tools/train.py', '--local_rank=0', '/content/mmdetection/configs/spinenet/spinenet_190_B_8gpu.py', '--launcher', 'pytorch']' returned non-zero exit status 1.
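Editor's note: a plausible reading of the size mismatch, not confirmed in the thread, is that SpineNet sums feature maps arriving from different downsampling paths, so the input height and width must divide evenly by the deepest stride used here (128, matching the anchor_strides up to 128 in the config). A quick sketch of the arithmetic for the 1000 px crops used below:

```python
import math

# Why a 1000x1000 input can trip SpineNet's cross-scale merge ops:
# features from different paths are summed, so each spatial size must
# halve cleanly down to the deepest stride (128 = 2**7).
size = 1000
for level in range(3, 8):
    print(level, size / 2 ** level)
# level 3: 125.0, level 4: 62.5, level 5: 31.25, ...
# Once a size stops dividing evenly, one path rounds down to 125 while
# an upsampled sibling yields 126 -- exactly the reported mismatch.

# Padding to the next multiple of 128 (as the test pipeline's
# size_divisor=128 already does) removes the problem:
print(math.ceil(size / 128) * 128)  # 1024
```

The full config that produced the error is below.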
cudnn_benchmark = True
# model settings
norm_cfg = dict(type='SyncBN', momentum=0.01, eps=1e-3, requires_grad=True)
model = dict(
    type='RetinaNet',
    backbone=dict(
        type='SpineNet',
        arch='190',
        norm_cfg=norm_cfg),
    neck=None,
    bbox_head=dict(
        type='RetinaSepBNHead',
        num_classes=8,
        num_ins=5,
        in_channels=512,
        stacked_convs=7,
        feat_channels=512,
        octave_base_scale=4,
        scales_per_octave=3,
        anchor_ratios=[0.5, 1.0, 2.0],
        anchor_strides=[8, 16, 32, 64, 128],
        target_means=[.0, .0, .0, .0],
        target_stds=[1.0, 1.0, 1.0, 1.0],
        norm_cfg=norm_cfg,
        loss_cls=dict(
            type='FocalLoss',
            use_sigmoid=True,
            gamma=2.0,
            alpha=0.25,
            loss_weight=1.0),
        loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0)))
# training and testing settings
train_cfg = dict(
    assigner=dict(
        type='MaxIoUAssigner',
        pos_iou_thr=0.5,
        neg_iou_thr=0.5,
        min_pos_iou=0,
        ignore_iof_thr=-1),
    allowed_border=-1,
    pos_weight=-1,
    debug=False)
test_cfg = dict(
    nms_pre=1000,
    min_bbox_size=0,
    score_thr=0.05,
    nms=dict(type='nms', iou_thr=0.5),
    max_per_img=100)
# dataset settings
dataset_type = 'MyDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Resize',
        img_scale=(1000, 1000),
        ratio_range=(0.1, 1.9),
        keep_ratio=True),
    dict(type='RandomCrop', crop_size=(1000, 1000)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    dict(type='Pad', size=(1000, 1000)),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(1000, 1000),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='Normalize', **img_norm_cfg),
            dict(type='Pad', size_divisor=128),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ])
]
data = dict(
    imgs_per_gpu=1,
    workers_per_gpu=1,
    train=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/train_group_cocotype.json',
        img_prefix=data_root + 'train/',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/val_group_cocotype.json',
        img_prefix=data_root + 'val/',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        ann_file=data_root + 'annotations/test_group_cocotype.json',
        img_prefix=data_root + 'test/',
        pipeline=test_pipeline))
evaluation = dict(interval=1, metric='bbox')
# optimizer
optimizer = dict(
    type='SGD',
    lr=0.001,
    momentum=0.9,
    weight_decay=4e-5,
    paramwise_options=dict(norm_decay_mult=0))
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
# learning policy
lr_config = dict(
    policy='step',
    warmup='linear',
    warmup_iters=8000,
    warmup_ratio=0.1,
    step=[5])
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
    interval=50,
    hooks=[
        dict(type='TextLoggerHook'),
        # dict(type='TensorboardLoggerHook')
    ])
# yapf:enable
# runtime settings
total_epochs = 5
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/spinenet_190_B/'
load_from = None
resume_from = None
workflow = [('train', 1), ('val', 1)]
I used my own dataset, which consists of 1000×1000 px sub-frames with 8 classes (background included).
Thank you in advance for your help.
"SyncBN" is only for Multi-GPU. Set it to BN or GN for single GPU
Thank you. I've just tested both. With BN I get the same RuntimeError:
Traceback (most recent call last):
  File "tools/train.py", line 142, in <module>
    main()
  File "tools/train.py", line 138, in main
    meta=meta)
  File "/content/mmdetection/mmdet/apis/train.py", line 111, in train_detector
    meta=meta)
  File "/content/mmdetection/mmdet/apis/train.py", line 305, in _non_dist_train
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/epoch_based_runner.py", line 122, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/epoch_based_runner.py", line 35, in train
    self.model, data_batch, train_mode=True, **kwargs)
  File "/content/mmdetection/mmdet/apis/train.py", line 75, in batch_processor
    losses = model(**data)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 150, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection/mmdet/core/fp16/decorators.py", line 49, in new_func
    return old_func(*args, **kwargs)
  File "/content/mmdetection/mmdet/models/detectors/base.py", line 137, in forward
    return self.forward_train(img, img_meta, **kwargs)
  File "/content/mmdetection/mmdet/models/detectors/single_stage.py", line 67, in forward_train
    x = self.extract_feat(img)
  File "/content/mmdetection/mmdet/models/detectors/single_stage.py", line 47, in extract_feat
    x = self.backbone(img)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection/mmdet/models/backbones/spinenet.py", line 259, in forward
    target_feat = self.merge_ops[i]([block_feats[feat_idx] for feat_idx in spec.input_offsets])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection/mmdet/models/backbones/spinenet.py", line 146, in forward
    target_feat = parent0_feat + parent1_feat
RuntimeError: The size of tensor a (125) must match the size of tensor b (126) at non-singleton dimension 3

With GN, model construction fails instead with an AssertionError:
Traceback (most recent call last):
  File "tools/train.py", line 142, in <module>
    main()
  File "tools/train.py", line 115, in main
    cfg.model, train_cfg=cfg.train_cfg, test_cfg=cfg.test_cfg)
  File "/content/mmdetection/mmdet/models/builder.py", line 43, in build_detector
    return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg))
  File "/content/mmdetection/mmdet/models/builder.py", line 15, in build
    return build_from_cfg(cfg, registry, default_args)
  File "/content/mmdetection/mmdet/utils/registry.py", line 79, in build_from_cfg
    return obj_cls(**args)
  File "/content/mmdetection/mmdet/models/detectors/retinanet.py", line 16, in __init__
    test_cfg, pretrained)
  File "/content/mmdetection/mmdet/models/detectors/single_stage.py", line 25, in __init__
    self.backbone = builder.build_backbone(backbone)
  File "/content/mmdetection/mmdet/models/builder.py", line 19, in build_backbone
    return build(cfg, BACKBONES)
  File "/content/mmdetection/mmdet/models/builder.py", line 15, in build
    return build_from_cfg(cfg, registry, default_args)
  File "/content/mmdetection/mmdet/utils/registry.py", line 79, in build_from_cfg
    return obj_cls(**args)
  File "/content/mmdetection/mmdet/models/backbones/spinenet.py", line 175, in __init__
    self._make_stem_layer(in_channels)
  File "/content/mmdetection/mmdet/models/backbones/spinenet.py", line 189, in _make_stem_layer
    norm_cfg=self.norm_cfg)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/cnn/bricks/conv_module.py", line 136, in __init__
    self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/cnn/bricks/norm.py", line 112, in build_norm_layer
    assert 'num_groups' in cfg_
AssertionError
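Editor's note: the AssertionError is mmcv's build_norm_layer insisting that a GN config specify num_groups. A sketch of a valid GN setting; the group count of 32 is an illustrative assumption and must evenly divide every normalized layer's channel count:

```python
# GN normalizes over channel groups instead of the batch, so it also
# works on a single GPU, but the group count is mandatory.
norm_cfg = dict(type='GN', num_groups=32, requires_grad=True)
```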
I think the RuntimeError (with BN) is due to something in train_pipeline, but I'm not sure.
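Editor's note: that suspicion looks consistent with the traceback. The train_pipeline pads to a fixed 1000×1000, which is not divisible by 128, while the test_pipeline already pads with size_divisor=128. A hedged fix, assuming the rest of the config is unchanged, is to pad training images the same way:

```python
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True),
    dict(
        type='Resize',
        img_scale=(1000, 1000),
        ratio_range=(0.1, 1.9),
        keep_ratio=True),
    dict(type='RandomCrop', crop_size=(1000, 1000)),
    dict(type='RandomFlip', flip_ratio=0.5),
    dict(type='Normalize', **img_norm_cfg),
    # Pad to a multiple of 128 (1000 -> 1024) instead of a fixed
    # 1000x1000 so every SpineNet level halves cleanly.
    dict(type='Pad', size_divisor=128),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
```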
Let me reopen the discussion: have you found the cause of this error?
Excuse me, how was this problem finally resolved?
Is it possible to train RetinaNet with a SpineNet-190 backbone on a single GPU using the classic mmdetection command
python tools/train.py ${CONFIG_FILE}
? I tried, and I get this error:
Traceback (most recent call last):
  File "tools/train.py", line 142, in <module>
    main()
  File "tools/train.py", line 138, in main
    meta=meta)
  File "/content/mmdetection_b/mmdet/apis/train.py", line 111, in train_detector
    meta=meta)
  File "/content/mmdetection_b/mmdet/apis/train.py", line 305, in _non_dist_train
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/runner.py", line 384, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/runner/runner.py", line 283, in train
    self.model, data_batch, train_mode=True, **kwargs)
  File "/content/mmdetection_b/mmdet/apis/train.py", line 75, in batch_processor
    losses = model(**data)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/data_parallel.py", line 150, in forward
    return self.module(*inputs[0], **kwargs[0])
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection_b/mmdet/core/fp16/decorators.py", line 49, in new_func
    return old_func(*args, **kwargs)
  File "/content/mmdetection_b/mmdet/models/detectors/base.py", line 137, in forward
    return self.forward_train(img, img_meta, **kwargs)
  File "/content/mmdetection_b/mmdet/models/detectors/single_stage.py", line 67, in forward_train
    x = self.extract_feat(img)
  File "/content/mmdetection_b/mmdet/models/detectors/single_stage.py", line 47, in extract_feat
    x = self.backbone(img)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/content/mmdetection_b/mmdet/models/backbones/spinenet.py", line 251, in forward
    feat = self.maxpool(self.conv1(input))
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/mmcv/cnn/bricks/conv_module.py", line 181, in forward
    x = self.norm(x)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 532, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/batchnorm.py", line 458, in forward
    world_size = torch.distributed.get_world_size(process_group)
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 586, in get_world_size
    return _get_group_size(group)
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 202, in _get_group_size
    _check_default_pg()
  File "/usr/local/lib/python3.6/dist-packages/torch/distributed/distributed_c10d.py", line 193, in _check_default_pg
    "Default process group is not initialized"
AssertionError: Default process group is not initialized
Thanks for your help.
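Editor's note: this last traceback fails inside SyncBatchNorm, which calls torch.distributed.get_world_size() and therefore requires an initialized process group; plain python tools/train.py never creates one. Besides switching norm_cfg to BN/GN or launching through bash tools/dist_train.sh {CONFIG_FILE} 1 as above, one hedged workaround is to initialize a one-process group before the model is built; a minimal sketch, assuming a single local NCCL-capable GPU:

```python
import os
import torch.distributed as dist

# Create a single-process "distributed" group so SyncBN's world-size
# query succeeds without going through torch.distributed.launch.
# The address/port are arbitrary placeholders for one local process.
os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
os.environ.setdefault('MASTER_PORT', '29500')
dist.init_process_group(backend='nccl', rank=0, world_size=1)
```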