Open shireen1512 opened 8 months ago
Hello, I'm trying to train ocrnet_hr18, and I'm using the Jupyter notebook downloaded from Colab. By running this line: runner.train() I get the following error:
runner.train()
File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:630, in _BaseDataLoaderIter.next(self) 627 if self._sampler_iter is None: 628 # TODO(https://github.com/pytorch/pytorch/issues/76750) 629 self._reset() # type: ignore[call-arg] --> 630 data = self._next_data() 631 self._num_yielded += 1 632 if self._dataset_kind == _DatasetKind.Iterable and \ 633 self._IterableDataset_len_called is not None and \ 634 self._num_yielded > self._IterableDataset_len_called:
File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:1328, in _MultiProcessingDataLoaderIter._next_data(self) 1325 return self._process_data(data) 1327 assert not self._shutdown and self._tasks_outstanding > 0 -> 1328 idx, data = self._get_data() 1329 self._tasks_outstanding -= 1 1330 if self._dataset_kind == _DatasetKind.Iterable: 1331 # Check for _IterableDatasetStopIteration
File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:1294, in _MultiProcessingDataLoaderIter._get_data(self) 1290 # In this case, self._data_queue is a queue.Queue,. But we don't 1291 # need to call .task_done() because we don't use .join(). 1292 else: 1293 while True: -> 1294 success, data = self._try_get_data() 1295 if success: 1296 return data
self._data_queue
queue.Queue
.task_done()
.join()
File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:1145, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout) 1143 if len(failed_workers) > 0: 1144 pids_str = ', '.join(str(w.pid) for w in failed_workers) -> 1145 raise RuntimeError(f'DataLoader worker (pid(s) {pids_str}) exited unexpectedly') from e 1146 if isinstance(e, queue.Empty): 1147 return (False, None)
RuntimeError: DataLoader worker (pid(s) 35132, 35468) exited unexpectedly
Here is how I created the config, following the Colab notebook:
cfg.norm_cfg = dict(type='BN', requires_grad=True) cfg.crop_size = (256, 256) cfg.model.data_preprocessor.size = cfg.crop_size cfg.model.backbone.norm_cfg = cfg.norm_cfg # Modify num classes of the model in decode/auxiliary head for head in cfg.model.decode_head: head['num_classes'] = 7 head['norm_cfg'] = cfg.norm_cfg # Modify dataset type and path cfg.dataset_type = 'myDataset' cfg.data_root = "dataset" cfg.train_dataloader.batch_size = 4 cfg.train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations'), dict(type='RandomResize', scale=(320, 240), ratio_range=(0.5, 2.0), keep_ratio=True), dict(type='RandomCrop', crop_size=cfg.crop_size, cat_max_ratio=0.75), dict(type='RandomFlip', prob=0.5), dict(type='PackSegInputs') ] cfg.test_pipeline = [ dict(type='LoadImageFromFile'), dict(type='Resize', scale=(320, 240), keep_ratio=True), dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] cfg.train_dataloader.dataset.type = cfg.dataset_type cfg.train_dataloader.dataset.data_root = cfg.data_root cfg.train_dataloader.dataset.data_prefix = dict(img_path=img_dir, seg_map_path=ann_dir) cfg.train_dataloader.dataset.pipeline = cfg.train_pipeline cfg.train_dataloader.dataset.ann_file = 'splits/train.txt' cfg.val_dataloader.dataset.type = cfg.dataset_type cfg.val_dataloader.dataset.data_root = cfg.data_root cfg.val_dataloader.dataset.data_prefix = dict(img_path=img_dir, seg_map_path=ann_dir) cfg.val_dataloader.dataset.pipeline = cfg.test_pipeline cfg.val_dataloader.dataset.ann_file = 'splits/val.txt' cfg.test_dataloader = cfg.val_dataloader # Load the pretrained weights cfg.load_from = 'ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320-401c5bdd.pth' # Set up working dir to save files and logs. 
cfg.work_dir = '/work_dir/trainwgoogleearth' cfg.train_cfg.max_iters = 10000 cfg.train_cfg.val_interval = 100 cfg.default_hooks.logger.interval = 10 cfg.default_hooks.checkpoint.interval = 500 # Set seed to facilitate reproducing the result cfg['randomness'] = dict(seed=0) # Let's have a look at the final config used for training print(f'Config:\n{cfg.pretty_text}')
Could anyone help with the error, please? I have installed mmcv with pip — could that be the issue? Here are the versions of everything:
Torch = 2.1.0 Cuda = 12.1 TorchVision= 0.16.0 mmseg = 1.2.2 OpenCV= 4.9.0 MMEngine= 0.10.3 MMCV= 2.1.0 MMCV Compiler= MSVC 192930148 MMCV CUDA Compiler= 12.1
It seems that Jupyter does not support creating DataLoader worker subprocesses. Please try setting num_workers=0 so the data is loaded in the main process instead.
Hello, I'm trying to train ocrnet_hr18, and I'm using the Jupyter notebook downloaded from Colab. By running this line:
runner.train()
I get the following error: File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:630, in _BaseDataLoaderIter.next(self) 627 if self._sampler_iter is None: 628 # TODO(https://github.com/pytorch/pytorch/issues/76750) 629 self._reset() # type: ignore[call-arg] --> 630 data = self._next_data() 631 self._num_yielded += 1 632 if self._dataset_kind == _DatasetKind.Iterable and \ 633 self._IterableDataset_len_called is not None and \ 634 self._num_yielded > self._IterableDataset_len_called:
File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:1328, in _MultiProcessingDataLoaderIter._next_data(self) 1325 return self._process_data(data) 1327 assert not self._shutdown and self._tasks_outstanding > 0 -> 1328 idx, data = self._get_data() 1329 self._tasks_outstanding -= 1 1330 if self._dataset_kind == _DatasetKind.Iterable: 1331 # Check for _IterableDatasetStopIteration
File c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:1294, in _MultiProcessingDataLoaderIter._get_data(self) 1290 # In this case,
self._data_queue
is a queue.Queue
. But we don't 1291 # need to call .task_done()
because we don't use .join()
. 1292 else: 1293 while True: -> 1294 success, data = self._try_get_data() 1295 if success: 1296 return dataFile c:\Users\Shireen\anaconda3\envs\mmsegmentation\lib\site-packages\torch\utils\data\dataloader.py:1145, in _MultiProcessingDataLoaderIter._try_get_data(self, timeout) 1143 if len(failed_workers) > 0: 1144 pids_str = ', '.join(str(w.pid) for w in failed_workers) -> 1145 raise RuntimeError(f'DataLoader worker (pid(s) {pids_str}) exited unexpectedly') from e 1146 if isinstance(e, queue.Empty): 1147 return (False, None)
RuntimeError: DataLoader worker (pid(s) 35132, 35468) exited unexpectedly
Here is how I created the config, following the Colab notebook:
Could anyone help with the error, please? I have installed mmcv with pip — could that be the issue? Here are the versions of everything:
Torch = 2.1.0 Cuda = 12.1 TorchVision= 0.16.0 mmseg = 1.2.2 OpenCV= 4.9.0 MMEngine= 0.10.3 MMCV= 2.1.0 MMCV Compiler= MSVC 192930148 MMCV CUDA Compiler= 12.1