RangiLyu / nanodet

NanoDet-Plus⚡Super fast and lightweight anchor-free object detection model. 🔥Only 980 KB (int8) / 1.8 MB (fp16) and runs at 97 FPS on a cellphone🔥
Apache License 2.0
5.77k stars 1.04k forks source link

RuntimeError: DataLoader worker (pid(s) 10156, 4764, 12160, 20160) exited unexpectedly #569

Open hizwj opened 4 months ago

hizwj commented 4 months ago

At the beginning of training, it was normal:

[NanoDet][06-29 15:22:19]INFO:Train|Epoch1/30|Iter0(1/8)| mem:2.61G| lr:1.00e-07| loss_qfl:0.5918| loss_bbox:1.1593| loss_dfl:0.5198| aux_loss_qfl:0.5985| aux_loss_bbox:1.1496| aux_loss_dfl:0.5269| INFO:NanoDet:Train|Epoch1/30|Iter0(1/8)| mem:2.61G| lr:1.00e-07| loss_qfl:0.5918| loss_bbox:1.1593| loss_dfl:0.5198| aux_loss

But every time training reached the 10th epoch, it reported an error: Traceback (most recent call last): File "<string>", line 1, in <module> File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main exitcode = _main(fd, parent_sentinel) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main prepare(preparation_data) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare _fixup_main_from_path(data['init_main_from_path']) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path main_content = runpy.run_path(main_path, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path return _run_module_code(code, init_globals, run_name, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in _run_module_code _run_code(code, mod_globals, init_globals, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in _run_code exec(code, run_globals) File "E:\py_project\nanodet-main\tools\train.py", line 19, in <module> import pytorch_lightning as pl File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\__init__.py", line 34, in <module> from lightning_fabric.utilities.seed import seed_everything # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\__init__.py", line 23, in <module> from lightning_fabric.fabric import Fabric # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in <module> import torch File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\__init__.py", line 128, in <module> raise err OSError: [WinError 1455] 页面文件太小,无法完成操作。 Error loading "D:\Anaconda\envs\nanodet\lib\site-packages\torch\lib\cufft64_10.dll" or one of its dependencies.
Traceback (most recent call last): File "", line 1, in File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main exitcode = _main(fd, parent_sentinel) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main prepare(preparation_data) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare _fixup_main_from_path(data['init_main_from_path']) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path main_content = runpy.run_path(main_path, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path return _run_module_code(code, init_globals, run_name, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in _run_module_code _run_code(code, mod_globals, init_globals, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in _run_code exec(code, run_globals) File "E:\py_project\nanodet-main\tools\train.py", line 19, in import pytorch_lightning as pl File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning__init.py", line 34, in from lightning_fabric.utilities.seed import seed_everything # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric__init.py", line 23, in from lightning_fabric.fabric import Fabric # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in import torch File "D:\Anaconda\envs\nanodet\lib\site-packages\torch__init.py", line 676, in from .storage import _StorageBase, TypedStorage, _LegacyStorage, UntypedStorage File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\storage.py", line 11, in import numpy as np File "D:\Anaconda\envs\nanodet\lib\site-packages\numpy\init.py", line 154, in from . import ma File "D:\Anaconda\envs\nanodet\lib\site-packages\numpy\ma\init__.py", line 42, in from . 
import core File "", line 991, in _find_and_load Traceback (most recent call last): File "", line 1, in File "", line 975, in _find_and_load_unlocked File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main File "", line 671, in _load_unlocked exitcode = _main(fd, parent_sentinel) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main prepare(preparation_data) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare File "", line 839, in exec_module _fixup_main_from_path(data['init_main_from_path']) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path File "", line 934, in get_code main_content = runpy.run_path(main_path, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path return _run_module_code(code, init_globals, run_name, File "", line 1033, in get_data File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in _run_module_code _run_code(code, mod_globals, init_globals, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in _run_code MemoryError exec(code, run_globals) File "E:\py_project\nanodet-main\tools\train.py", line 19, in import pytorch_lightning as pl File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\init__.py", line 34, in from lightning_fabric.utilities.seed import seed_everything # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\init.py", line 23, in from lightning_fabric.fabric import Fabric # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in import torch File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\init__.py", line 218, in from torch._C import * # noqa: F403 RuntimeError: MemoryError: Out of memory interning an attribute name Traceback (most recent call last): File "", line 1, in File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 116, in spawn_main exitcode = _main(fd, parent_sentinel) 
File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 125, in _main prepare(preparation_data) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 236, in prepare _fixup_main_from_path(data['init_main_from_path']) File "D:\Anaconda\envs\nanodet\lib\multiprocessing\spawn.py", line 287, in _fixup_main_from_path main_content = runpy.run_path(main_path, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 265, in run_path return _run_module_code(code, init_globals, run_name, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 97, in _run_module_code _run_code(code, mod_globals, init_globals, File "D:\Anaconda\envs\nanodet\lib\runpy.py", line 87, in _run_code exec(code, run_globals) File "E:\py_project\nanodet-main\tools\train.py", line 19, in import pytorch_lightning as pl File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\init__.py", line 34, in from lightning_fabric.utilities.seed import seed_everything # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\init.py", line 23, in from lightning_fabric.fabric import Fabric # noqa: E402 File "D:\Anaconda\envs\nanodet\lib\site-packages\lightning_fabric\fabric.py", line 21, in import torch File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\init.py", line 831, in from .functional import * # noqa: F403 File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\functional.py", line 8, in import torch.nn.functional as F File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\init.py", line 1, in from .modules import * # noqa: F403 File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\modules\init__.py", line 18, in from .batchnorm import BatchNorm1d, BatchNorm2d, BatchNorm3d, SyncBatchNorm, \ File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\modules\batchnorm.py", line 9, in from ._functions import SyncBatchNorm as sync_batch_norm File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\nn\modules_functions.py", line 4, in from torch.autograd.function 
import Function File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\autograd\init.py", line 21, in from . import functional File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\autograd\functional.py", line 3, in from . import forward_ad as fwAD File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\autograd\forward_ad.py", line 106, in _UnpackedDualTensor = namedtuple('_UnpackedDualTensor', ['primal', 'tangent']) File "D:\Anaconda\envs\nanodet\lib\collections\init__.py", line 394, in namedtuple exec(s, namespace) MemoryError [NanoDet][06-29 15:28:13]INFO:Val|Epoch10/30|Iter80(1/2)| mem:2.79G| lr:1.58e-04| loss_qfl:0.9156| loss_bbox:0.9682| loss_dfl:0.4420| aux_loss_qfl:0.4186| aux_loss_bbox:0.7197| aux_loss_dfl:0.3550| INFO:NanoDet:Val|Epoch10/30|Iter80(1/2)| mem:2.79G| lr:1.58e-04| loss_qfl:0.9156| loss_bbox:0.9682| loss_dfl:0.4420| aux_loss_qfl:0.4186| aux_loss_bbox:0.7197| aux_loss_dfl:0.3550|

Traceback (most recent call last): File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1120, in _try_get_data data = self._data_queue.get(timeout=timeout) File "D:\Anaconda\envs\nanodet\lib\queue.py", line 178, in get raise Empty _queue.Empty

The above exception was the direct cause of the following exception:

Traceback (most recent call last): File "tools/train.py", line 156, in <module> main(args) File "tools/train.py", line 151, in main trainer.fit(task, train_dataloader, val_dataloader, ckpt_path=model_resume_path) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 608, in fit call._call_and_handle_interrupt( File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\call.py", line 38, in _call_and_handle_interrupt return trainer_fn(*args, **kwargs) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 650, in _fit_impl self._run(model, ckpt_path=self.ckpt_path) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1112, in _run results = self._run_stage() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1191, in _run_stage self._run_train() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\trainer\trainer.py", line 1214, in _run_train self.fit_loop.run() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 199, in run self.advance(*args, **kwargs) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\fit_loop.py", line 267, in advance self._outputs = self.epoch_loop.run(self._data_fetcher) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 200, in run self.on_advance_end() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\epoch\training_epoch_loop.py", line 250, in on_advance_end self._run_validation() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\epoch\training_epoch_loop.py", line 308, in _run_validation self.val_loop.run() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 199, in run self.advance(*args, **kwargs) File
"D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\dataloader\evaluation_loop.py", line 152, in advance dl_outputs = self.epoch_loop.run(self._data_fetcher, dl_max_batches, kwargs) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\loop.py", line 199, in run self.advance(*args, **kwargs) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\loops\epoch\evaluation_epoch_loop.py", line 121, in advance batch = next(data_fetcher) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\utilities\fetching.py", line 184, in next return self.fetching_function() File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\utilities\fetching.py", line 265, in fetching_function self._fetch_next_batch(self.dataloader_iter) File "D:\Anaconda\envs\nanodet\lib\site-packages\pytorch_lightning\utilities\fetching.py", line 280, in _fetch_next_batch batch = next(iterator) File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 628, in next data = self._next_data() File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1316, in _next_data idx, data = self._get_data() File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1272, in _get_data success, data = self._try_get_data() File "D:\Anaconda\envs\nanodet\lib\site-packages\torch\utils\data\dataloader.py", line 1133, in _try_get_data raise RuntimeError('DataLoader worker (pid(s) {}) exited unexpectedly'.format(pids_str)) from e RuntimeError: DataLoader worker (pid(s) 10156, 4764, 12160, 20160) exited unexpectedly

What is wrong with it? What should I do to solve this problem?