Closed tino926 closed 2 years ago
When I try to train YOLOX with multiple GPUs, I get the following errors:
2021-10-13 09:44:36 | INFO | yolox.core.trainer:151 - init prefetcher, this might take one minute or less... 2021-10-13 09:44:51 | ERROR | yolox.core.launch:147 - An error has been caught in function '_distributed_worker', process 'SpawnProcess-1' (9795), thread 'MainThread' (140328941930304): Traceback (most recent call last): File "<string>", line 1, in <module> File "/usr/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main exitcode = _main(fd, parent_sentinel) │ │ └ 5 │ └ 8 └ <function _main at 0x7fa0df4b6700> File "/usr/lib/python3.8/multiprocessing/spawn.py", line 129, in _main return self._bootstrap(parent_sentinel) │ │ └ 5 │ └ <function BaseProcess._bootstrap at 0x7fa0df5be550> └ <SpawnProcess name='SpawnProcess-1' parent=9759 started> File "/usr/lib/python3.8/multiprocessing/process.py", line 313, in _bootstrap self.run() │ └ <function BaseProcess.run at 0x7fa0df5b9b80> └ <SpawnProcess name='SpawnProcess-1' parent=9759 started> File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run self._target(*self._args, **self._kwargs) │ │ │ │ │ └ {} │ │ │ │ └ <SpawnProcess name='SpawnProcess-1' parent=9759 started> │ │ │ └ (<function _distributed_worker at 0x7fa06c82c160>, 0, (<function main at 0x7fa06c82c4c0>, 2, 2, 0, 'nccl', 'tcp://127.0.0.1:3... │ │ └ <SpawnProcess name='SpawnProcess-1' parent=9759 started> │ └ <function _wrap at 0x7fa0dcc9e3a0> └ <SpawnProcess name='SpawnProcess-1' parent=9759 started> File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 19, in _wrap fn(i, *args) │ │ └ (<function main at 0x7fa06c82c4c0>, 2, 2, 0, 'nccl', 'tcp://127.0.0.1:33517', (╒══════════════════╤══════════════════════════... 
│ └ 0 └ <function _distributed_worker at 0x7fa06c82c160> > File "/home/tino/curr_proj/yolov5_working_129/yolox_20210929_48316/yolox/core/launch.py", line 147, in _distributed_worker main_func(*args) │ └ (╒══════════════════╤════════════════════════════════════════════════════════════════════════════════════════════════════════... └ <function main at 0x7fa06c82c4c0> File "/home/tino/curr_proj/yolov5_working_129/yolox_20210929_48316/tools/train.py", line 110, in main trainer.train() │ └ <function Trainer.train at 0x7fa06c5b4f70> └ <yolox.core.trainer.Trainer object at 0x7fa06a08c2e0> File "/home/tino/curr_proj/yolov5_working_129/yolox_20210929_48316/yolox/core/trainer.py", line 70, in train self.before_train() │ └ <function Trainer.before_train at 0x7fa06a2fdf70> └ <yolox.core.trainer.Trainer object at 0x7fa06a08c2e0> File "/home/tino/curr_proj/yolov5_working_129/yolox_20210929_48316/yolox/core/trainer.py", line 152, in before_train self.prefetcher = DataPrefetcher(self.train_loader) │ │ │ └ <yolox.data.dataloading.DataLoader object at 0x7fa08c914fa0> │ │ └ <yolox.core.trainer.Trainer object at 0x7fa06a08c2e0> │ └ <class 'yolox.data.data_prefetcher.DataPrefetcher'> └ <yolox.core.trainer.Trainer object at 0x7fa06a08c2e0> File "/home/tino/curr_proj/yolov5_working_129/yolox_20210929_48316/yolox/data/data_prefetcher.py", line 17, in __init__ self.loader = iter(loader) │ └ <yolox.data.dataloading.DataLoader object at 0x7fa08c914fa0> └ <yolox.data.data_prefetcher.DataPrefetcher object at 0x7fa08c914ca0> File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 352, in __iter__ return self._get_iterator() │ └ <function DataLoader._get_iterator at 0x7fa0798ab820> └ <yolox.data.dataloading.DataLoader object at 0x7fa08c914fa0> File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 294, in _get_iterator return _MultiProcessingDataLoaderIter(self) │ └ <yolox.data.dataloading.DataLoader object at 
0x7fa08c914fa0> └ <class 'torch.utils.data.dataloader._MultiProcessingDataLoaderIter'> File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 801, in __init__ w.start() │ └ <function BaseProcess.start at 0x7fa0df5b9c10> └ <Process name='Process-1:3' parent=9795 initial daemon> File "/usr/lib/python3.8/multiprocessing/process.py", line 121, in start self._popen = self._Popen(self) │ │ │ │ └ <Process name='Process-1:3' parent=9795 initial daemon> │ │ │ └ <staticmethod object at 0x7fa0df5f6340> │ │ └ <Process name='Process-1:3' parent=9795 initial daemon> │ └ None └ <Process name='Process-1:3' parent=9795 initial daemon> File "/usr/lib/python3.8/multiprocessing/context.py", line 224, in _Popen return _default_context.get_context().Process._Popen(process_obj) │ │ └ <Process name='Process-1:3' parent=9795 initial daemon> │ └ <function DefaultContext.get_context at 0x7fa0df54f670> └ <multiprocessing.context.DefaultContext object at 0x7fa0df5c24f0> File "/usr/lib/python3.8/multiprocessing/context.py", line 283, in _Popen return Popen(process_obj) │ └ <Process name='Process-1:3' parent=9795 initial daemon> └ <class 'multiprocessing.popen_spawn_posix.Popen'> File "/usr/lib/python3.8/multiprocessing/popen_spawn_posix.py", line 32, in __init__ super().__init__(process_obj) └ <Process name='Process-1:3' parent=9795 initial daemon> File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 19, in __init__ self._launch(process_obj) │ │ └ <Process name='Process-1:3' parent=9795 initial daemon> │ └ <function Popen._launch at 0x7fa06a0840d0> └ <multiprocessing.popen_spawn_posix.Popen object at 0x7f9ff147c550> File "/usr/lib/python3.8/multiprocessing/popen_spawn_posix.py", line 58, in _launch self.pid = util.spawnv_passfds(spawn.get_executable(), │ │ │ │ └ <function get_executable at 0x7fa0df4b6430> │ │ │ └ <module 'multiprocessing.spawn' from '/usr/lib/python3.8/multiprocessing/spawn.py'> │ │ └ <function spawnv_passfds at 
0x7fa0df4b6310> │ └ <module 'multiprocessing.util' from '/usr/lib/python3.8/multiprocessing/util.py'> └ <multiprocessing.popen_spawn_posix.Popen object at 0x7f9ff147c550> File "/usr/lib/python3.8/multiprocessing/util.py", line 429, in spawnv_passfds return _posixsubprocess.fork_exec( │ └ <built-in function fork_exec> └ <module '_posixsubprocess' (built-in)> OSError: [Errno 12] Cannot allocate memory Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa0798b3790> Traceback (most recent call last): File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__ self._shutdown_workers() File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1174, in _shutdown_workers if self._persistent_workers or self._workers_status[worker_id]: AttributeError: '_MultiProcessingDataLoaderIter' object has no attribute '_workers_status' ^CTraceback (most recent call last): File "tools/train.py", line 125, in <module> launch( File "/home/tino/curr_proj/yolov5_working_129/yolox_20210929_48316/yolox/core/launch.py", line 82, in launch mp.start_processes( File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 157, in start_processes while not context.join(): File "/home/tino/venv/for_yolox/lib/python3.8/site-packages/torch/multiprocessing/spawn.py", line 75, in join ready = multiprocessing.connection.wait( File "/usr/lib/python3.8/multiprocessing/connection.py", line 930, in wait ready = selector.select(timeout) File "/usr/lib/python3.8/selectors.py", line 415, in select fd_event_list = self._selector.poll(timeout) KeyboardInterrupt ^CError in atexit._run_exitfuncs: Traceback (most recent call last): File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 27, in poll pid, sts = os.waitpid(self.pid, flag) KeyboardInterrupt
This is the command I used:
python tools/train.py -f exps/example/custom/yolox_tiny_r4.py -d 2 -b 16 --resume
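The line `OSError: [Errno 12] Cannot allocate memory` tells you what's wrong: the system ran out of memory while the DataLoader was starting one of its worker processes.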
OSError: [Errno 12] Cannot allocate memory
Reducing `num_workers` (the number of DataLoader worker processes) might help you.
num_workers
This worked. Thanks for the suggestion.
Regards, Debapriya
When I try to train YOLOX with multiple GPUs, I get the following errors:
This is the command I used: