MIC-DKFZ / nnUNet

Apache License 2.0
5.9k stars · 1.76k forks · source link

RuntimeError: Some background workers are no longer alive #2036

Status: Closed — congcongwy51 closed this issue 7 months ago

congcongwy51 commented 7 months ago

(wy) root@autodl-container-28da11ab52-7d6a74d6:~/nnUNet/nnunetv2# OMP_NUM_THREADS=1 nnUNetv2_train 1 3d_lowres 0 --c Using device: cuda:0

####################################################################### Please cite the following paper when using nnU-Net: Isensee, F., Jaeger, P. F., Kohl, S. A., Petersen, J., & Maier-Hein, K. H. (2021). nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation. Nature methods, 18(2), 203-211. #######################################################################

This is the configuration used by this training: Configuration name: 3d_lowres {'data_identifier': 'nnUNetPlans_3d_lowres', 'preprocessor_name': 'DefaultPreprocessor', 'batch_size': 2, 'patch_size': [64, 192, 192], 'median_image_size_in_voxels': [103, 296, 296], 'spacing': [3.6896215962746104, 1.879049723441106, 1.879049723441106], 'normalization_schemes': ['CTNormalization'], 'use_mask_for_norm': [False], 'resampling_fn_data': 'resample_data_or_seg_to_shape', 'resampling_fn_seg': 'resample_data_or_seg_to_shape', 'resampling_fn_data_kwargs': {'is_seg': False, 'order': 3, 'order_z': 0, 'force_separate_z': None}, 'resampling_fn_seg_kwargs': {'is_seg': True, 'order': 1, 'order_z': 0, 'force_separate_z': None}, 'resampling_fn_probabilities': 'resample_data_or_seg_to_shape', 'resampling_fn_probabilities_kwargs': {'is_seg': False, 'order': 1, 'order_z': 0, 'force_separate_z': None}, 'architecture': {'network_class_name': 'dynamic_network_architectures.architectures.unet.PlainConvUNet', 'arch_kwargs': {'n_stages': 6, 'features_per_stage': [32, 64, 128, 256, 320, 320], 'conv_op': 'torch.nn.modules.conv.Conv3d', 'kernel_sizes': [[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3]], 'strides': [[1, 1, 1], [2, 2, 2], [2, 2, 2], [2, 2, 2], [2, 2, 2], [1, 2, 2]], 'n_conv_per_stage': [2, 2, 2, 2, 2, 2], 'n_conv_per_stage_decoder': [2, 2, 2, 2, 2], 'conv_bias': True, 'norm_op': 'torch.nn.modules.instancenorm.InstanceNorm3d', 'norm_op_kwargs': {'eps': 1e-05, 'affine': True}, 'dropout_op': None, 'dropout_op_kwargs': None, 'nonlin': 'torch.nn.LeakyReLU', 'nonlin_kwargs': {'inplace': True}, 'deep_supervision': True}, '_kw_requires_import': ['conv_op', 'norm_op', 'dropout_op', 'nonlin']}, 'batch_dice': False, 'next_stage': '3d_cascade_fullres'}

These are the global plan.json settings: {'dataset_name': 'Dataset001_segrap', 'plans_name': 'nnUNetPlans', 'original_median_spacing_after_transp': [3.0, 0.54296875, 0.54296875], 'original_median_shape_after_transp': [126, 1024, 1024], 'image_reader_writer': 'SimpleITKIO', 'transpose_forward': [0, 1, 2], 'transpose_backward': [0, 1, 2], 'experiment_planner_used': 'ExperimentPlanner', 'label_manager': 'LabelManager', 'foreground_intensity_properties_per_channel': {'0': {'max': 2682.0, 'mean': 3.85494327545166, 'median': 36.0, 'min': -1000.0, 'percentile_00_5': -968.0, 'percentile_99_5': 897.0, 'std': 310.99725341796875}}} Traceback (most recent call last): File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/resource_sharer.py", line 138, in _serve with self._listener.accept() as conn: File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 470, in accept deliver_challenge(c, self._authkey) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 745, in deliver_challenge response = connection.recv_bytes(256) # reject large message File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 221, in recv_bytes buf = self._recv_bytes(maxlength) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes buf = self._recv(4) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 388, in _recv raise EOFError EOFError Traceback (most recent call last):Traceback (most recent call last): File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/resource_sharer.py", line 138, in _serve with self._listener.accept() as conn: File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 470, in accept deliver_challenge(c, self._authkey) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 745, in deliver_challenge response = connection.recv_bytes(256) # reject large message File 
"/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 221, in recv_bytes buf = self._recv_bytes(maxlength) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes buf = self._recv(4) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 388, in _recv raise EOFError EOFError Traceback (most recent call last): File "/root/miniconda3/envs/wy/bin/nnUNetv2_train", line 8, in sys.exit(run_training_entry()) File "/root/nnUNet/nnunetv2/run/run_training.py", line 268, in run_training_entry run_training(args.dataset_name_or_id, args.configuration, args.fold, args.tr, args.p, args.pretrained_weights, File "/root/nnUNet/nnunetv2/run/run_training.py", line 208, in run_training nnunet_trainer.perform_actual_validation(export_validation_probabilities) File "/root/nnUNet/nnunetv2/training/nnUNetTrainer/nnUNetTrainer.py", line 1183, in perform_actual_validation proceed = not check_workers_alive_and_busy(segmentation_export_pool, worker_list, results, File "/root/nnUNet/nnunetv2/utilities/file_path_utilities.py", line 103, in check_workers_alive_and_busy raise RuntimeError('Some background workers are no longer alive') RuntimeError: Some background workers are no longer alive File "/root/miniconda3/envs/wy/bin/nnUNetv2_train", line 8, in sys.exit(run_training_entry()) File "/root/nnUNet/nnunetv2/run/run_training.py", line 268, in run_training_entry run_training(args.dataset_name_or_id, args.configuration, args.fold, args.tr, args.p, args.pretrained_weights, File "/root/nnUNet/nnunetv2/run/run_training.py", line 208, in run_training nnunet_trainer.perform_actual_validation(export_validation_probabilities) File "/root/nnUNet/nnunetv2/training/nnUNetTrainer/nnUNetTrainer.py", line 1183, in perform_actual_validation proceed = not check_workers_alive_and_busy(segmentation_export_pool, worker_list, results, File "/root/nnUNet/nnunetv2/utilities/file_path_utilities.py", line 103, in 
check_workers_alive_and_busy raise RuntimeError('Some background workers are no longer alive') RuntimeError: Some background workers are no longer alive

congcongwy51 commented 7 months ago

(wy) root@autodl-container-28da11ab52-7d6a74d6:~/nnUNet/nnunetv2# OMP_NUM_THREADS=1 nnUNetv2_train 1 3d_lowres 0 --npz Using device: cuda:0

####################################################################### Please cite the following paper when using nnU-Net: Isensee, F., Jaeger, P. F., Kohl, S. A., Petersen, J., & Maier-Hein, K. H. (2021). nnU-Net: a self-configuring method for deep learning-based biomedical image segmentation. Nature methods, 18(2), 203-211. #######################################################################

This is the configuration used by this training: Configuration name: 3d_lowres {'data_identifier': 'nnUNetPlans_3d_lowres', 'preprocessor_name': 'DefaultPreprocessor', 'batch_size': 2, 'patch_size': [64, 192, 192], 'median_image_size_in_voxels': [103, 296, 296], 'spacing': [3.6896215962746104, 1.879049723441106, 1.879049723441106], 'normalization_schemes': ['CTNormalization'], 'use_mask_for_norm': [False], 'resampling_fn_data': 'resample_data_or_seg_to_shape', 'resampling_fn_seg': 'resample_data_or_seg_to_shape', 'resampling_fn_data_kwargs': {'is_seg': False, 'order': 3, 'order_z': 0, 'force_separate_z': None}, 'resampling_fn_seg_kwargs': {'is_seg': True, 'order': 1, 'order_z': 0, 'force_separate_z': None}, 'resampling_fn_probabilities': 'resample_data_or_seg_to_shape', 'resampling_fn_probabilities_kwargs': {'is_seg': False, 'order': 1, 'order_z': 0, 'force_separate_z': None}, 'architecture': {'network_class_name': 'dynamic_network_architectures.architectures.unet.PlainConvUNet', 'arch_kwargs': {'n_stages': 6, 'features_per_stage': [32, 64, 128, 256, 320, 320], 'conv_op': 'torch.nn.modules.conv.Conv3d', 'kernel_sizes': [[3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3], [3, 3, 3]], 'strides': [[1, 1, 1], [2, 2, 2], [2, 2, 2], [2, 2, 2], [2, 2, 2], [1, 2, 2]], 'n_conv_per_stage': [2, 2, 2, 2, 2, 2], 'n_conv_per_stage_decoder': [2, 2, 2, 2, 2], 'conv_bias': True, 'norm_op': 'torch.nn.modules.instancenorm.InstanceNorm3d', 'norm_op_kwargs': {'eps': 1e-05, 'affine': True}, 'dropout_op': None, 'dropout_op_kwargs': None, 'nonlin': 'torch.nn.LeakyReLU', 'nonlin_kwargs': {'inplace': True}, 'deep_supervision': True}, '_kw_requires_import': ['conv_op', 'norm_op', 'dropout_op', 'nonlin']}, 'batch_dice': False, 'next_stage': '3d_cascade_fullres'}

These are the global plan.json settings: {'dataset_name': 'Dataset001_segrap', 'plans_name': 'nnUNetPlans', 'original_median_spacing_after_transp': [3.0, 0.54296875, 0.54296875], 'original_median_shape_after_transp': [126, 1024, 1024], 'image_reader_writer': 'SimpleITKIO', 'transpose_forward': [0, 1, 2], 'transpose_backward': [0, 1, 2], 'experiment_planner_used': 'ExperimentPlanner', 'label_manager': 'LabelManager', 'foreground_intensity_properties_per_channel': {'0': {'max': 2682.0, 'mean': 3.85494327545166, 'median': 36.0, 'min': -1000.0, 'percentile_00_5': -968.0, 'percentile_99_5': 897.0, 'std': 310.99725341796875}}}

2024-03-25 16:07:54.756058: unpacking dataset... 2024-03-25 16:07:58.575657: unpacking done... 2024-03-25 16:07:58.581875: do_dummy_2d_data_aug: False 2024-03-25 16:07:58.588931: Using splits from existing split file: /root/autodl-fs/DATASET/nnUNet_preprocessed/Dataset001_segrap/splits_final.json 2024-03-25 16:07:58.609387: The split file contains 5 splits. 2024-03-25 16:07:58.612865: Desired fold for training: 0 2024-03-25 16:07:58.616118: This split has 67 training and 17 validation cases. 2024-03-25 16:07:58.739364: Unable to plot network architecture: 2024-03-25 16:07:58.742365: No module named 'IPython' 2024-03-25 16:07:58.806226: 2024-03-25 16:07:58.809519: Epoch 0 2024-03-25 16:07:58.813372: Current learning rate: 0.01 using pin_memory on device 0 using pin_memory on device 0 2024-03-25 17:31:47.676724: segrap_0013, shape torch.Size([1, 103, 276, 276]), rank 0 Traceback (most recent call last): File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/resource_sharer.py", line 138, in _serve with self._listener.accept() as conn: File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 470, in accept deliver_challenge(c, self._authkey) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 745, in deliver_challenge response = connection.recv_bytes(256) # reject large message File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 221, in recv_bytes buf = self._recv_bytes(maxlength) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 419, in _recv_bytes buf = self._recv(4) File "/root/miniconda3/envs/wy/lib/python3.9/multiprocessing/connection.py", line 388, in _recv raise EOFError EOFError Traceback (most recent call last): File "/root/miniconda3/envs/wy/bin/nnUNetv2_train", line 8, in sys.exit(run_training_entry()) File "/root/nnUNet/nnunetv2/run/run_training.py", line 268, in run_training_entry run_training(args.dataset_name_or_id, 
args.configuration, args.fold, args.tr, args.p, args.pretrained_weights, File "/root/nnUNet/nnunetv2/run/run_training.py", line 208, in run_training nnunet_trainer.perform_actual_validation(export_validation_probabilities) File "/root/nnUNet/nnunetv2/training/nnUNetTrainer/nnUNetTrainer.py", line 1183, in perform_actual_validation proceed = not check_workers_alive_and_busy(segmentation_export_pool, worker_list, results, File "/root/nnUNet/nnunetv2/utilities/file_path_utilities.py", line 103, in check_workers_alive_and_busy raise RuntimeError('Some background workers are no longer alive') RuntimeError: Some background workers are no longer alive