Hi Jian Zou,
Thanks for your work. I ran the pretrain script and hit the following error.
log:
drop_info is set to {0: {'max_tokens': 30, 'drop_range': (0, 30)}, 1: {'max_tokens': 60, 'drop_range': (30, 60)}, 2: {'max_tokens': 100, 'drop_range': (60, 100)}, 3: {'max_tokens': 200, 'drop_range': (100, 200)}, 4: {'max_tokens': 256, 'drop_range': (200, 100000)}}, in input_layer
/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/nn/functional.py:3609: UserWarning: Default upsampling behavior when mode=bicubic is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
warnings.warn(
/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/nn/functional.py:3657: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
warnings.warn(
/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/nn/functional.py:3609: UserWarning: Default upsampling behavior when mode=bicubic is changed to align_corners=False since 0.4.0. Please specify align_corners=True if the old behavior is desired. See the documentation of nn.Upsample for details.
warnings.warn(
/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/nn/functional.py:3657: UserWarning: The default behavior for interpolate/upsample with float scale_factor changed in 1.6.0 to align with other frameworks/libraries, and now uses scale_factor directly, instead of relying on the computed output size. If you wish to restore the old behavior, please set recompute_scale_factor=True. See the documentation of nn.Upsample for details.
warnings.warn(
[W reducer.cpp:283] Warning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance.
grad.sizes() = [256, 768, 1, 1], strides() = [768, 1, 768, 768]
bucket_view.sizes() = [256, 768, 1, 1], strides() = [768, 1, 1, 1] (function operator())
[W reducer.cpp:283] Warning: Grad strides do not match bucket view strides. This may indicate grad was not created according to the gradient layout contract, or that the param's strides changed since DDP was constructed. This is not an error, but may impair performance.
grad.sizes() = [256, 768, 1, 1], strides() = [768, 1, 768, 768]
bucket_view.sizes() = [256, 768, 1, 1], strides() = [768, 1, 1, 1] (function operator())
/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmcv/runner/hooks/optimizer.py:31: FutureWarning: Non-finite norm encountered in torch.nn.utils.clip_grad_norm_; continuing anyway. Note that the default behavior will change in a future release to error out if a non-finite total norm is encountered. At that point, setting error_if_nonfinite=false will be required to retain the old behavior.
return clip_grad.clip_grad_norm_(params, self.grad_clip)
/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmcv/runner/hooks/optimizer.py:31: FutureWarning: Non-finite norm encountered in torch.nn.utils.clip_grad_norm_; continuing anyway. Note that the default behavior will change in a future release to error out if a non-finite total norm is encountered. At that point, setting error_if_nonfinite=false will be required to retain the old behavior.
return clip_grad.clip_grad_norm_(params, self.grad_clip)
2023-10-04 21:04:22,689 - mmcv - INFO - Reducer buckets have been rebuilt in this iteration.
Traceback (most recent call last):
File "tools/train.py", line 224, in
main()
File "tools/train.py", line 214, in main
train_model(
File "/home/liziyi/UniM2AE/Pretrain/mmdet3d/apis/train.py", line 11, in train_model
train_detector(
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmdet/apis/train.py", line 170, in train_detector
runner.run(data_loaders, cfg.workflow)
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmcv/runner/epoch_based_runner.py", line 127, in run
epoch_runner(data_loaders[i], **kwargs)
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmcv/runner/epoch_based_runner.py", line 51, in train
self.call_hook('after_train_iter')
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
getattr(hook, fn_name)(self)
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/mmcv/runner/hooks/optimizer.py", line 224, in after_train_iter
self.loss_scaler.scale(runner.outputs['loss']).backward()
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/autograd/init.py", line 147, in backward
Variable._execution_engine.run_backward(
File "/home/liziyi/.conda/envs/pimae/lib/python3.8/site-packages/torch/autograd/function.py", line 87, in apply
return self._forward_cls.backward(self, *args)  # type: ignore[attr-defined]
File "/home/liziyi/UniM2AE/Pretrain/mmdet3d/ops/voxel/scatter_points.py", line 43, in backward
dynamic_point_to_voxel_backward(grad_feats,
RuntimeError: CUDA error: an illegal memory access was encountered
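Since CUDA kernels launch asynchronously, I suspect the line reported in scatter_points.py may not be the actual fault site. Below is a minimal sketch of how the error could be narrowed down; `check_scatter_inputs` is a hypothetical helper of mine, not part of the repo, and assumes the `feats`/`coors` tensors fed to DynamicScatter are reachable at the call site:

```python
# Minimal debugging sketch (not part of the repo). CUDA kernels launch
# asynchronously, so the traceback above may blame a later op; forcing
# synchronous launches makes the failing kernel raise at its real call site.
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before CUDA is initialized

import torch

def check_scatter_inputs(feats: torch.Tensor, coors: torch.Tensor) -> None:
    # Hypothetical sanity check for the dynamic point-to-voxel inputs:
    # illegal accesses in scatter kernels are usually out-of-range indices
    # or non-finite values propagated from earlier layers.
    assert torch.isfinite(feats).all(), "non-finite point features"
    # mmdet3d (if I read it right) marks points outside the point-cloud
    # range with coors == -1; anything below -1 would index out of bounds
    # in the CUDA kernel.
    assert (coors >= -1).all(), f"bad voxel coordinate: {coors.min().item()}"
```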
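Possibly related: the FutureWarning above says the total gradient norm was already non-finite before the crash, and the reducer warning means some conv gradients arrive in a non-default memory layout. Here is a hedged sketch of the two checks I would add after loss.backward(); both helper names are my own, and `model` is assumed to be the module later wrapped in DistributedDataParallel:

```python
# Hedged diagnostic sketch, assuming `model` is the module later wrapped
# in DistributedDataParallel; neither helper exists in the repo.
import torch

def report_nonfinite_grads(model: torch.nn.Module) -> None:
    # Run right after loss.backward() to see which parameters feed the
    # non-finite total norm that clip_grad_norm_ complains about.
    for name, p in model.named_parameters():
        if p.grad is not None and not torch.isfinite(p.grad).all():
            print(f"non-finite grad in {name}")

def make_grads_contiguous(model: torch.nn.Module) -> None:
    # Possible workaround for the "Grad strides do not match bucket view
    # strides" warning: rewrite each gradient into the default contiguous
    # layout before DDP's reducer copies it into its flat bucket.
    for p in model.parameters():
        if p.requires_grad:
            p.register_hook(lambda g: g.contiguous())
```

If `report_nonfinite_grads` fires, that would suggest the loss blows up before the scatter backward ever faults; if not, the problem is more likely inside the kernel itself. Any pointers would be appreciated.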