Open GuoWei2019 opened 4 years ago
@GuoWei2019 sorry, it is a bug and has been fixed in the latest commit.
@tianzhi0549 I encountered the same problem. I cloned this code yesterday and ran training with only 1 GPU, using the following command: 'python -m torch.distributed.launch --nproc_per_node=1 --master_port=$((RANDOM + 10000)) tools/train_net.py --config-file configs/fcos/fcos_syncbn_bs32_c128_MNV2_FPN_1x.yaml DATALOADER.NUM_WORKERS 2 OUTPUT_DIR training_dir/fcos_syncbn_bs32_c128_MNV2_FPN_1x'
File "tools/train_net.py", line 173, in main
model = train(cfg, args.local_rank, args.distributed)
File "tools/train_net.py", line 79, in train
arguments,
File "/nfs/project/wangzhihui/programs/FCOS/fcos_core/engine/trainer.py", line 69, in do_train
loss_dict = model(images, targets)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/nfs/project/wangzhihui/programs/FCOS/fcos_core/modeling/detector/generalized_rcnn.py", line 49, in forward
features = self.backbone(images.tensors)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/nfs/project/wangzhihui/programs/FCOS/fcos_core/modeling/backbone/mobilenet.py", line 116, in forward
x = m(x)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 532, in __call__
result = self.forward(*input, **kwargs)
File "/home/luban/.local/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py", line 458, in forward
world_size = torch.distributed.get_world_size(process_group)
File "/home/luban/.local/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 586, in get_world_size
return _get_group_size(group)
File "/home/luban/.local/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 202, in _get_group_size
_check_default_pg()
File "/home/luban/.local/lib/python3.6/site-packages/torch/distributed/distributed_c10d.py", line 193, in _check_default_pg
"Default process group is not initialized"
AssertionError: Default process group is not initialized
warnings.warn("torch.distributed.reduce_op is deprecated, please use "
Traceback (most recent call last):
File "tools/train_net.py", line 180, in <module>
main()
File "tools/train_net.py", line 173, in main
model = train(cfg, args.local_rank, args.distributed)
File "tools/train_net.py", line 79, in train
arguments,
File "/FCOS/fcos_core/engine/trainer.py", line 69, in do_train
loss_dict = model(images, targets)
File "/anaconda3/envs/FCOS/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/FCOS/fcos_core/modeling/detector/generalized_rcnn.py", line 50, in forward
proposals, proposal_losses = self.rpn(images, features, targets)
File "/anaconda3/envs/FCOS/lib/python3.7/site-packages/torch/nn/modules/module.py", line 493, in __call__
result = self.forward(*input, **kwargs)
File "/FCOS/fcos_core/modeling/rpn/fcos/fcos.py", line 159, in forward
centerness, targets
File "/FCOS/fcos_core/modeling/rpn/fcos/fcos.py", line 169, in _forward_train
locations, box_cls, box_regression, centerness, targets
File "/FCOS/fcos_core/modeling/rpn/fcos/loss.py", line 282, in __call__
reduce_sum(centerness_flatten.new_tensor([0.0]))
File "/FCOS/fcos_core/modeling/rpn/fcos/loss.py", line 29, in reduce_sum
dist.all_reduce(tensor, op=dist.reduce_op.SUM)
File "/anaconda3/envs/FCOS/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 899, in all_reduce
_check_default_pg()
File "/anaconda3/envs/FCOS/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 191, in _check_default_pg
"Default process group is not initialized"
AssertionError: Default process group is not initialized