fudan-zvg / SETR

[CVPR 2021] Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers
MIT License
1.05k stars 150 forks source link

RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED #57

Open yl5y18 opened 1 year ago

yl5y18 commented 1 year ago

I want to train SETR-Naive on my own dataset. I have organized the dataset according to the ADE20K directory hierarchy and then executed './tools/dist_train.sh configs/SETR/SETR-Naive_512x512_160k_ade20k_bs_16.py 1'. I then copied the config file "SETR-Naive_512x512_160k_ade20k_bs_16.py" that was generated in "work_dirs" to the "configs/SETR" folder, renamed it "my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py", and modified the configuration as follows:

norm_cfg = dict(type='SyncBN', requires_grad=True) model = dict( type='EncoderDecoder', backbone=dict( type='VisionTransformer', model_name='vit_large_patch16_384', img_size=512, patch_size=16, in_chans=3, embed_dim=1024, depth=24, num_heads=16, num_classes=2, # I have modified the number of all classes drop_rate=0.0, norm_cfg=dict(type='SyncBN', requires_grad=True), pos_embed_interp=True, align_corners=False), decode_head=dict( type='VisionTransformerUpHead', in_channels=1024, channels=512, in_index=23, img_size=512, embed_dim=1024, num_classes=2, # I have modified the number of all classes norm_cfg=dict(type='SyncBN', requires_grad=True), num_conv=2, upsampling_method='bilinear', align_corners=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), conv3x3_conv1x1=False), auxiliary_head=[ dict( type='VisionTransformerUpHead', in_channels=1024, channels=512, in_index=9, img_size=512, embed_dim=1024, num_classes=2, # I have modified the number of all classes norm_cfg=dict(type='SyncBN', requires_grad=True), num_conv=2, upsampling_method='bilinear', align_corners=False, conv3x3_conv1x1=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), dict( type='VisionTransformerUpHead', in_channels=1024, channels=512, in_index=14, img_size=512, embed_dim=1024, num_classes=2, # I have modified the number of all classes norm_cfg=dict(type='SyncBN', requires_grad=True), num_conv=2, upsampling_method='bilinear', align_corners=False, conv3x3_conv1x1=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), dict( type='VisionTransformerUpHead', in_channels=1024, channels=512, in_index=19, img_size=512, embed_dim=1024, num_classes=2, # I have modified the number of all classes norm_cfg=dict(type='SyncBN', requires_grad=True), num_conv=2, upsampling_method='bilinear', align_corners=False, conv3x3_conv1x1=False, loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)) 
]) train_cfg = dict() test_cfg = dict(mode='slide', crop_size=(512, 512), stride=(341, 341)) dataset_type = 'ADE20KDataset' data_root = '/data/SETR/data/BHMaskKidney_ideal' # I modify the dataset path img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) crop_size = (512, 512) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75), dict(type='RandomFlip', flip_ratio=0.5), dict(type='PhotoMetricDistortion'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_semantic_seg']) ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ] data = dict( samples_per_gpu=1, # batch size workers_per_gpu=0, # nums gpu train=dict( type='ADE20KDataset', data_root='/data/SETR/data/BHMaskKidney_ideal', img_dir='images/training', ann_dir='annotations/training', pipeline=[ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)), dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=0.75), dict(type='RandomFlip', flip_ratio=0.5), dict(type='PhotoMetricDistortion'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255), dict(type='DefaultFormatBundle'), 
dict(type='Collect', keys=['img', 'gt_semantic_seg']) ]), val=dict( type='ADE20KDataset', data_root='/data/SETR/data/BHMaskKidney_ideal', img_dir='images/validation', ann_dir='annotations/validation', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ]), test=dict( type='ADE20KDataset', data_root='/data/SETR/data/BHMaskKidney_ideal', img_dir='images/validation', ann_dir='annotations/validation', pipeline=[ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(2048, 512), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict( type='Normalize', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']) ]) ])) log_config = dict( interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)]) dist_params = dict(backend='nccl') log_level = 'INFO' load_from = None resume_from = None workflow = [('train', 1)] cudnn_benchmark = True optimizer = dict( type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0, paramwise_cfg=dict(custom_keys=dict(head=dict(lr_mult=10.0)))) optimizer_config = dict() lr_config = dict(policy='poly', power=0.9, min_lr=0.0001, by_epoch=False) total_iters = 160 checkpoint_config = dict(by_epoch=False, interval=50) evaluation = dict(interval=50, metric='mIoU') find_unused_parameters = True work_dir = './work_dirs/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16' gpu_ids = range(0, 1)

Then I executed './tools/dist_train.sh configs/SETR/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py 1'. The error reported is as follows: Traceback (most recent call last): File "./tools/train.py", line 195, in main() File "./tools/train.py", line 191, in main meta=meta) File "/data/SETR/mmseg/apis/train.py", line 116, in train_segmentor runner.run(data_loaders, cfg.workflow, cfg.total_iters) File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run iter_runner(iter_loaders[i], kwargs) File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/iter_based_runner.py", line 60, in train outputs = self.model.train_step(data_batch, self.optimizer, kwargs) File "/opt/conda/lib/python3.7/site-packages/mmcv/parallel/distributed.py", line 54, in train_step output = self.module.train_step(inputs[0], kwargs[0]) File "/data/SETR/mmseg/models/segmentors/base.py", line 152, in train_step losses = self(data_batch) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl result = self.forward(input, kwargs) File "/opt/conda/lib/python3.7/site-packages/mmcv/runner/fp16_utils.py", line 84, in new_func return old_func(args, kwargs) File "/data/SETR/mmseg/models/segmentors/base.py", line 122, in forward return self.forward_train(img, img_metas, kwargs) File "/data/SETR/mmseg/models/segmentors/encoder_decoder.py", line 158, in forward_train gt_semantic_seg) File "/data/SETR/mmseg/models/segmentors/encoder_decoder.py", line 102, in _decode_head_forward_train self.train_cfg) File "/data/SETR/mmseg/models/decode_heads/decode_head.py", line 185, in forward_train seg_logits = self.forward(inputs) File "/data/SETR/mmseg/models/decode_heads/vit_up_head.py", line 153, in forward x = self.syncbn_fc_0(x) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl result = self.forward(input, kwargs) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/batchnorm.py", line 
501, in forward exponential_average_factor, self.eps) File "/opt/conda/lib/python3.7/site-packages/torch/nn/functional.py", line 2016, in batch_norm training, momentum, eps, torch.backends.cudnn.enabled RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED Traceback (most recent call last): File "/opt/conda/lib/python3.7/runpy.py", line 193, in _run_module_as_main "main", mod_spec) File "/opt/conda/lib/python3.7/runpy.py", line 85, in _run_code exec(code, run_globals) File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 261, in main() File "/opt/conda/lib/python3.7/site-packages/torch/distributed/launch.py", line 257, in main cmd=cmd) subprocess.CalledProcessError: Command '['/opt/conda/bin/python', '-u', './tools/train.py', '--local_rank=0', 'configs/SETR/my_SETR_Naive_512x512_160k_ade20k_BHkidney_bs_16.py', '--launcher', 'pytorch']' returned non-zero exit status 1. root@ipfsunion-Server:/data/SETR# Traceback (most recent call last): File "./tools/train.py", line 195, in main() File "./tools/train.py", line 191, in main meta=meta) File "/data/SETR/mmseg/apis/train.py", line 69, in train_segmentor model.cuda(), File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 458, in cuda return self._apply(lambda t: t.cuda(device)) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply module._apply(fn) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply module._apply(fn) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 354, in _apply module._apply(fn) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 376, in _apply param_applied = fn(param) File "/opt/conda/lib/python3.7/site-packages/torch/nn/modules/module.py", line 458, in return self._apply(lambda t: t.cuda(device)) KeyboardInterrupt Traceback (most recent call last): File "./tools/train.py", line 195, in main() File 
"./tools/train.py", line 191, in main meta=meta) File "/data/SETR/mmseg/apis/train.py", line 72, in train_segmentor find_unused_parameters=find_unused_parameters) File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 333, in init self.broadcast_bucket_size) File "/opt/conda/lib/python3.7/site-packages/torch/nn/parallel/distributed.py", line 549, in _distributed_broadcast_coalesced dist._broadcast_coalesced(self.process_group, tensors, buffer_size) RuntimeError: Broken pipe

My environment configuration is as follows: sys.platform: linux Python: 3.7.0 (default, Oct 9 2018, 10:31:47) [GCC 7.3.0] CUDA available: True GPU 0,1: NVIDIA RTX A6000 NVCC: Cuda compilation tools, release 10.1, V10.1.243 GCC: gcc (Ubuntu 7.4.0-1ubuntu1~18.04.1) 7.4.0 PyTorch: 1.6.0 CuDNN 7.6.3 MMCV: 1.2.7 MMCV Compiler: GCC 7.3 MMCV CUDA Compiler: 10.1 MMSegmentation: 0.6.0+

May I ask how to handle this error?