When I run
export GPU=2 && LR=0.01 && CUDA_VISIBLE_DEVICES=0,1 PORT=10000 tools/dist_train.sh configs2/TinyPerson/base/faster_rcnn_r50_fpn_1x_TinyPerson640.py $GPU \
  --work-dir ../TOV_mmdetection_cache/work_dir/TinyPerson/Base/faster_rcnn_r50_fpn_1x_TinyPerson640/old640x512_lr${LR}_1x_${GPU}g/ \
  --cfg-options optimizer.lr=${LR}
the following error occurred:
/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects --local_rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
FutureWarning,
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
  File "tools/train.py", line 15, in <module>
    from mmdet.apis import set_random_seed, train_detector
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/__init__.py", line 1, in <module>
    from .inference import (async_inference_detector, inference_detector,
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/inference.py", line 6, in <module>
    from mmcv.ops import RoIPool
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/__init__.py", line 2, in <module>
    from .assign_score_withk import assign_score_withk
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/assign_score_withk.py", line 6, in <module>
    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/utils/ext_loader.py", line 13, in load_ext
    ext = importlib.import_module('mmcv.' + name)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
ImportError: libcudart.so.11.0: cannot open shared object file: No such file or directory
Traceback (most recent call last):
  File "tools/train.py", line 15, in <module>
    from mmdet.apis import set_random_seed, train_detector
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/__init__.py", line 1, in <module>
    from .inference import (async_inference_detector, inference_detector,
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/inference.py", line 6, in <module>
    from mmcv.ops import RoIPool
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/__init__.py", line 2, in <module>
    from .assign_score_withk import assign_score_withk
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/assign_score_withk.py", line 6, in <module>
    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/utils/ext_loader.py", line 13, in load_ext
    ext = importlib.import_module('mmcv.' + name)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
ImportError: libcudart.so.11.0: cannot open shared object file: No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 252455) of binary: /media/host/fwq/Devdata/anaconda3/envs/open-mmlab/bin/python
Traceback (most recent call last):
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/run.py", line 755, in run
    )(*cmd_args)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 247, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
when I run
export GPU=2 && LR=0.01 && CUDA_VISIBLE_DEVICES=0,1 PORT=10000 tools/dist_train.sh configs2/TinyPerson/base/faster_rcnn_r50_fpn_1x_TinyPerson640.py $GPU \
  --work-dir ../TOV_mmdetection_cache/work_dir/TinyPerson/Base/faster_rcnn_r50_fpn_1x_TinyPerson640/old640x512_lr${LR}_1x_${GPU}g/ \
  --cfg-options optimizer.lr=${LR}
error occurred:
/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py:186: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects --local_rank argument to be set, please
change it to read from os.environ['LOCAL_RANK'] instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
FutureWarning,
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
  File "tools/train.py", line 15, in <module>
    from mmdet.apis import set_random_seed, train_detector
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/__init__.py", line 1, in <module>
    from .inference import (async_inference_detector, inference_detector,
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/inference.py", line 6, in <module>
    from mmcv.ops import RoIPool
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/__init__.py", line 2, in <module>
    from .assign_score_withk import assign_score_withk
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/assign_score_withk.py", line 6, in <module>
    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/utils/ext_loader.py", line 13, in load_ext
    ext = importlib.import_module('mmcv.' + name)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
ImportError: libcudart.so.11.0: cannot open shared object file: No such file or directory
Traceback (most recent call last):
  File "tools/train.py", line 15, in <module>
    from mmdet.apis import set_random_seed, train_detector
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/__init__.py", line 1, in <module>
    from .inference import (async_inference_detector, inference_detector,
  File "/media/host/fwq/Devdata/lsy/PointTinyBenchmark/TOV_mmdetection/mmdet/apis/inference.py", line 6, in <module>
    from mmcv.ops import RoIPool
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/__init__.py", line 2, in <module>
    from .assign_score_withk import assign_score_withk
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/ops/assign_score_withk.py", line 6, in <module>
    '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/mmcv/utils/ext_loader.py", line 13, in load_ext
    ext = importlib.import_module('mmcv.' + name)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/importlib/__init__.py", line 127, in import_module
    return _bootstrap._gcd_import(name[level:], package, level)
ImportError: libcudart.so.11.0: cannot open shared object file: No such file or directory
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 252455) of binary: /media/host/fwq/Devdata/anaconda3/envs/open-mmlab/bin/python
Traceback (most recent call last):
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/run.py", line 755, in run
    )(*cmd_args)
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/media/host/fwq/Devdata/anaconda3/envs/open-mmlab/lib/python3.7/site-packages/torch/distributed/launcher/api.py", line 247, in launch_agent
    failures=result.failures,
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
tools/train.py FAILED
Failures:
[1]:
  time      : 2022-09-19_16:41:33
  host      : host
  rank      : 1 (local_rank: 1)
  exitcode  : 1 (pid: 252456)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
Root Cause (first observed failure):
[0]:
  time      : 2022-09-19_16:41:33
  host      : host
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 252455)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
How can I solve this problem? Thank you.