open-mmlab / mmcv

OpenMMLab Computer Vision Foundation
https://mmcv.readthedocs.io/en/latest/
Apache License 2.0

[Bug] CUDA Error: illegal memory access #3089

Status: Open. Ha-coding-user opened this issue 2 months ago.

Ha-coding-user commented 2 months ago

Prerequisite

Environment

PyTorch was installed with conda, but the same error also occurred when it was installed with pip.

{'sys.platform': 'linux', 'Python': '3.8.19 (default, Mar 20 2024, 19:58:24) [GCC 11.2.0]', 'CUDA available': True, 'GPU 0,1,2,3,4,5,6,7': 'NVIDIA A100 80GB PCIe', 'CUDA_HOME': '/usr/local/cuda-11.3', 'NVCC': 'Build cuda_11.3.r11.3/compiler.29920130_0', 'GCC': 'gcc (GCC) 6.1.0', 'PyTorch': '1.10.1', 'PyTorch compiling details': 'PyTorch built with:\n - GCC 7.3\n - C++ Version: 201402\n - Intel(R) oneAPI Math Kernel Library Version 2023.1-Product Build 20230303 for Intel(R) 64 architecture applications\n - Intel(R) MKL-DNN v2.2.3 (Git Hash 7336ca9f055cf1bfa13efb658fe15dc9b41f0740)\n - OpenMP 201511 (a.k.a. OpenMP 4.5)\n - LAPACK is enabled (usually provided by MKL)\n - NNPACK is enabled\n - CPU capability usage: AVX2\n - CUDA Runtime 11.3\n - NVCC architecture flags: -gencode;arch=compute_37,code=sm_37;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_37,code=compute_37\n - CuDNN 8.2\n - Magma 2.5.2\n - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=11.3, CUDNN_VERSION=8.2.0, CXX_COMPILER=/opt/rh/devtoolset-7/root/usr/bin/c++, CXX_FLAGS= -Wno-deprecated -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -fopenmp -DNDEBUG -DUSE_KINETO -DUSE_FBGEMM -DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DEDGE_PROFILER_USE_KINETO -O2 -fPIC -Wno-narrowing -Wall -Wextra -Werror=return-type -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-sign-compare -Wno-unused-parameter -Wno-unused-variable -Wno-unused-function -Wno-unused-result -Wno-unused-local-typedefs -Wno-strict-overflow -Wno-strict-aliasing -Wno-error=deprecated-declarations -Wno-stringop-overflow -Wno-psabi -Wno-error=pedantic -Wno-error=redundant-decls -Wno-error=old-style-cast -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=1.10.1, USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, USE_NNPACK=ON, USE_OPENMP=ON, \n', 'TorchVision': '0.11.2', 'OpenCV': '4.9.0', 'MMCV': '1.4.0', 'MMCV Compiler': 'GCC 7.3', 'MMCV CUDA Compiler': '11.3'}
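
For reference, the dump above appears to match the output of mmcv's collect_env helper, so it can be regenerated in any suspect environment; a minimal sketch (mmcv 1.x API):

# Minimal sketch: reprint the environment report above (mmcv 1.x API).
from mmcv.utils import collect_env

for key, value in collect_env().items():
    print(f'{key}: {value}')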

Reproduces the problem - code sample

work_dir = 'result/voxformer-T'
_base_ = [
    '../_base_/default_runtime.py'
]
plugin = True
plugin_dir = 'projects/mmdet3d_plugin/'

_num_layers_cross_ = 3
_num_points_cross_ = 8
_num_layers_self_ = 2
_num_points_self_ = 8
_dim_ = 128
_pos_dim_ = _dim_ // 2
_ffn_dim_ = _dim_ * 2
_num_levels_ = 1

_labels_tag_ = 'labels'
_num_cams_ = 5
_temporal_ = [-12, -9, -6, -3]
point_cloud_range = [0, -25.6, -2.0, 51.2, 25.6, 4.4]
voxel_size = [0.2, 0.2, 0.2]

_sem_scal_loss_ = True
_geo_scal_loss_ = True
_depthmodel_ = 'msnet3d'
_nsweep_ = 10
_query_tag_ = 'query_iou5203_pre7712_rec6153'

model = dict(
   type='VoxFormer',
   pretrained=dict(img='ckpts/resnet50-19c8e357.pth'),
   img_backbone=dict(
       type='ResNet',
       depth=50,
       num_stages=4,
       out_indices=(2,),
       frozen_stages=1,
       norm_cfg=dict(type='BN', requires_grad=False),
       norm_eval=True,
       style='pytorch'),
   img_neck=dict(
       type='FPN',
       in_channels=[1024],
       out_channels=_dim_,
       start_level=0,
       add_extra_convs='on_output',
       num_outs=_num_levels_,
       relu_before_extra_convs=True),
   pts_bbox_head=dict(
       type='VoxFormerHead',
       bev_h=128,
       bev_w=128,
       bev_z=16,
       embed_dims=_dim_,
       CE_ssc_loss=True,
       geo_scal_loss=_geo_scal_loss_,
       sem_scal_loss=_sem_scal_loss_,
       cross_transformer=dict(
           type='PerceptionTransformer',
           rotate_prev_bev=True,
           use_shift=True,
           embed_dims=_dim_,
            num_cams=_num_cams_,
           encoder=dict(
               type='VoxFormerEncoder',
               num_layers=_num_layers_cross_,
               pc_range=point_cloud_range,
               num_points_in_pillar=8,
               return_intermediate=False,
               transformerlayers=dict(
                   type='VoxFormerLayer',
                   attn_cfgs=[
                       dict(
                           type='DeformCrossAttention',
                           pc_range=point_cloud_range,
                           num_cams=_num_cams_,
                           deformable_attention=dict(
                               type='MSDeformableAttention3D',
                               embed_dims=_dim_,
                               num_points=_num_points_cross_,
                               num_levels=_num_levels_),
                           embed_dims=_dim_,
                       )
                   ],
                   ffn_cfgs=dict(
                       type='FFN',
                       embed_dims=_dim_,
                       feedforward_channels=1024,
                       num_fcs=2,
                       ffn_drop=0.,
                       act_cfg=dict(type='ReLU', inplace=True),
                   ),
                   feedforward_channels=_ffn_dim_,
                   ffn_dropout=0.1,
                   operation_order=('cross_attn', 'norm', 'ffn', 'norm')))),
       self_transformer=dict(
           type='PerceptionTransformer',
           rotate_prev_bev=True,
           use_shift=True,
           embed_dims=_dim_,
            num_cams=_num_cams_,
           encoder=dict(
               type='VoxFormerEncoder',
               num_layers=_num_layers_self_,
               pc_range=point_cloud_range,
               num_points_in_pillar=8,
               return_intermediate=False,
               transformerlayers=dict(
                   type='VoxFormerLayer',
                   attn_cfgs=[
                       dict(
                           type='DeformSelfAttention',
                           embed_dims=_dim_,
                           num_levels=1,
                           num_points=_num_points_self_)
                   ],
                   ffn_cfgs=dict(
                       type='FFN',
                       embed_dims=_dim_,
                       feedforward_channels=1024,
                       num_fcs=2,
                       ffn_drop=0.,
                       act_cfg=dict(type='ReLU', inplace=True),
                   ),
                   feedforward_channels=_ffn_dim_,
                   ffn_dropout=0.1,
                   operation_order=('self_attn', 'norm', 'ffn', 'norm')))),
       positional_encoding=dict(
           type='LearnedPositionalEncoding',
           num_feats=_pos_dim_,
           row_num_embed=512,
           col_num_embed=512,
           )),
   train_cfg=dict(pts=dict(
       grid_size=[512, 512, 1],
       voxel_size=voxel_size,
       point_cloud_range=point_cloud_range,
       out_size_factor=4)))

dataset_type = 'SemanticKittiDatasetStage2'
data_root = './kitti/'
file_client_args = dict(backend='disk')

data = dict(
   samples_per_gpu=1,
   workers_per_gpu=4,
    train=dict(
        type=dataset_type,
        split="train",
        test_mode=False,
        data_root=data_root,
        preprocess_root=data_root + 'dataset',
        eval_range=51.2,
        depthmodel=_depthmodel_,
        nsweep=_nsweep_,
        temporal=_temporal_,
        labels_tag=_labels_tag_,
        query_tag=_query_tag_),
    val=dict(
        type=dataset_type,
        split="val",
        test_mode=True,
        data_root=data_root,
        preprocess_root=data_root + 'dataset',
        eval_range=51.2,
        depthmodel=_depthmodel_,
        nsweep=_nsweep_,
        temporal=_temporal_,
        labels_tag=_labels_tag_,
        query_tag=_query_tag_),
    test=dict(
        type=dataset_type,
        split="val",
        test_mode=True,
        data_root=data_root,
        preprocess_root=data_root + 'dataset',
        eval_range=51.2,
        depthmodel=_depthmodel_,
        nsweep=_nsweep_,
        temporal=_temporal_,
        labels_tag=_labels_tag_,
        query_tag=_query_tag_),
   shuffler_sampler=dict(type='DistributedGroupSampler'),
   nonshuffler_sampler=dict(type='DistributedSampler')
)
optimizer = dict(
   type='AdamW',
   lr=2e-4,
   weight_decay=0.01)

optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
   policy='CosineAnnealing',
   warmup='linear',
   warmup_iters=500,
   warmup_ratio=1.0 / 3,
   min_lr_ratio=1e-3)
total_epochs = 20
evaluation = dict(interval=1)

runner = dict(type='EpochBasedRunner', max_epochs=total_epochs)
log_config = dict(
   interval=50,
   hooks=[
       dict(type='TextLoggerHook'),
       dict(type='TensorboardLoggerHook')
   ])

# checkpoint_config = None
checkpoint_config = dict(interval=2)
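
Since this config inherits from _base_ files, a quick sanity check before chasing the CUDA error is to load the merged config the same way tools/train.py does and print the values that reach the attention modules; a minimal sketch with the mmcv 1.x Config API (the config path is an assumption):

from mmcv import Config

# Load the merged config, including everything pulled in via _base_.
# NOTE: the path below is a guess at where this config file lives.
cfg = Config.fromfile('projects/configs/voxformer/voxformer-T.py')

head = cfg.model.pts_bbox_head
# These values feed the deformable-attention kernels; confirm they merged as intended.
print('bev grid:', head.bev_h, head.bev_w, head.bev_z)
print('embed_dims:', head.embed_dims)
print('cross encoder layers:', head.cross_transformer.encoder.num_layers)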

Reproduces the problem - command or script

#!/usr/bin/env bash

CONFIG=$1
GPUS=$2
PORT=${PORT:-28509}

PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.launch --nproc_per_node=$GPUS --master_port=$PORT \
    $(dirname "$0")/train.py $CONFIG --launcher pytorch ${@:3} --deterministic
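
A debugging note on this script: CUDA kernels launch asynchronously, so the Python traceback usually points at a line well after the kernel that actually faulted. Prefixing the command above with CUDA_LAUNCH_BLOCKING=1, or setting the variable at the very top of train.py before torch is imported, makes the failing kernel raise synchronously; a minimal sketch of the in-script variant:

# Sketch: force synchronous kernel launches so the illegal access surfaces
# at the call that actually triggered it. The variable must be set before
# CUDA is initialized, i.e. at the very top of train.py.
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import torch  # imported only after the environment variable is set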

Reproduces the problem - error message

Hello, I am reproducing your fantastic VoxFormer. Stage-1 training finishes without problems, but stage-2 fails with the error below.

My compute spec:
4x NVIDIA A100 (80 GB)
CUDA 11.3, PyTorch 1.10.1

Please tell me a solution; I am stuck.

Error Message
error in ms_deformable_im2col_cuda: an illegal memory access was encountered
Traceback (most recent call last):
File "./tools/train.py", line 261, in
main()
File "./tools/train.py", line 250, in main
custom_train_model(
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/apis/train.py", line 27, in custom_train_model
custom_train_detector(
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/apis/mmdet_train.py", line 200, in custom_train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
self.run_iter(data_batch, train_mode=True, **kwargs)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/epoch_based_runner.py", line 29, in run_iter
outputs = self.model.train_step(data_batch, self.optimizer,
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/parallel/distributed.py", line 52, in train_step
output = self.module.train_step(*inputs[0], **kwargs[0])
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmdet/models/detectors/base.py", line 237, in train_step
losses = self(**data)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/detectors/voxformer.py", line 108, in forward
return self.forward_train(**kwargs)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 98, in new_func
return old_func(*args, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/detectors/voxformer.py", line 138, in forward_train
losses_pts = self.forward_pts_train(img_feats, img_metas, target)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/detectors/voxformer.py", line 93, in forward_pts_train
outs = self.pts_bbox_head(img_feats, img_metas, target)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/dense_heads/voxformer_head.py", line 95, in forward
seed_feats = self.cross_transformer.get_vox_features(
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 98, in new_func
return old_func(*args, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/modules/transformer.py", line 136, in get_vox_features
bev_embed = self.encoder(
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 98, in new_func
return old_func(*args, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/modules/encoder.py", line 205, in forward
output = layer(
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/modules/encoder.py", line 372, in forward
query = self.attentions[attn_index](
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/mmcv/runner/fp16_utils.py", line 186, in new_func
return old_func(*args, **kwargs)
File "/home/hamyo/SSC/mmdetection3d/VoxFormer/projects/mmdet3d_plugin/voxformer/modules/deformable_cross_attention.py", line 171, in forward
slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)]
RuntimeError: CUDA error: an illegal memory access was encountered
terminate called after throwing an instance of 'c10::CUDAError'
what(): CUDA error: an illegal memory access was encountered
Exception raised from create_event_internal at /opt/conda/conda-bld/pytorch_1639180588308/work/c10/cuda/CUDACachingAllocator.cpp:1211 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fdede183d62 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #1: + 0x1c613 (0x7fdf236a1613 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #2: c10::cuda::CUDACachingAllocator::raw_delete(void*) + 0x1a2 (0x7fdf236a2022 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libc10_cuda.so)
frame #3: c10::TensorImpl::release_resources() + 0xa4 (0x7fdede16d314 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libc10.so)
frame #4: + 0x295359 (0x7fdf776a3359 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #5: + 0xadb231 (0x7fdf77ee9231 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #6: THPVariable_subclass_dealloc(_object*) + 0x292 (0x7fdf77ee9532 in /home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/lib/libtorch_python.so)
frame #7: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4d386f]
frame #8: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4e55cb]
frame #9: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4e55cb]
frame #10: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4e0800]
frame #11: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f16a8]
frame #12: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #13: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #14: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #15: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #16: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #17: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #18: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4f1691]
frame #19: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x4c9280]
frame #20: PyDict_SetItemString + 0x52 (0x5823a2 in /home/hamyo/anaconda3/envs/voxformer/bin/python)
frame #21: PyImport_Cleanup + 0x93 (0x5a7623 in /home/hamyo/anaconda3/envs/voxformer/bin/python)
frame #22: Py_FinalizeEx + 0x71 (0x5a6751 in /home/hamyo/anaconda3/envs/voxformer/bin/python)
frame #23: Py_RunMain + 0x112 (0x5a21f2 in /home/hamyo/anaconda3/envs/voxformer/bin/python)
frame #24: Py_BytesMain + 0x39 (0x57a799 in /home/hamyo/anaconda3/envs/voxformer/bin/python)
frame #25: __libc_start_main + 0xe7 (0x7fdfb1900c87 in /lib/x86_64-linux-gnu/libc.so.6)
frame #26: /home/hamyo/anaconda3/envs/voxformer/bin/python() [0x57a64d]

WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 53769 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 53771 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 53772 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 1 (pid: 53770) of binary: /home/hamyo/anaconda3/envs/voxformer/bin/python
Traceback (most recent call last):
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/distributed/launch.py", line 193, in
main()
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/distributed/run.py", line 710, in run
elastic_launch(
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 131, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/hamyo/anaconda3/envs/voxformer/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
./tools/train.py FAILED
Failures:
<NO_OTHER_FAILURES>
Root Cause (first observed failure):
[0]:
time : 2024-04-20_00:00:00
host : server-45
rank : 1 (local_rank: 1)
exitcode : -6 (pid: 53770)
error_file: <N/A>
traceback : Signal 6 (SIGABRT) received by PID 53770
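
A reading of the log, for what it is worth: the "error in ms_deformable_im2col_cuda" line printed before the traceback suggests the illegal access happens inside the deformable-attention CUDA kernel, and the scatter-add at deformable_cross_attention.py line 171 is merely where it is observed. A hedged guard that can be dropped in just above that line to separate the two cases (the names mirror the failing statement; the shape assumptions are mine):

# Hypothetical bounds check, inserted just above:
#   slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)]
# Assumes slots has shape (bs, num_query, dim) and index_query_per_img is a
# LongTensor of query indices; adjust if the real shapes differ.
if index_query_per_img.numel() > 0:
    assert index_query_per_img.min() >= 0, 'negative query index'
    assert index_query_per_img.max() < slots.shape[1], (
        f'query index {int(index_query_per_img.max())} out of range '
        f'for slots of shape {tuple(slots.shape)}')

If the asserts never fire, the corruption most likely originates upstream in the ms_deformable_im2col kernel (for example, sampling locations or spatial shapes that do not match the flattened feature maps), which is consistent with the kernel's own error message.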

Additional information

1. I am reproducing VoxFormer. Stage 1 trains correctly and produces the model weights, but stage 2 does not.
2. Dataset: SemanticKITTI.
3. I think the error occurs when the deformable attention is used, but this is just my inference.
GuoSicen commented 2 weeks ago

I met the same bug too.