### When I try to train on multiple GPUs, the following error occurs.
An exception has occurred: RuntimeError
Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/models/models.py", line 546, in forward
return self.forward_once(x)
File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/models/models.py", line 607, in forward_once
x = module(x)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/container.py", line 119, in forward
input = module(input)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 399, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 396, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM
You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.
File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/train.py", line 296, in train
pred = model(imgs) # forward
File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/train.py", line 546, in <module>
train(hyp, opt, device, tb_writer, wandb)
### When I try to train on multiple GPUs, the following error occurs.
An exception has occurred: RuntimeError Caught RuntimeError in replica 0 on device 0. Original Traceback (most recent call last): File "/usr/local/lib/python3.6/dist-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker output = module(*input, **kwargs) File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl result = self.forward(*input, **kwargs) File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/models/models.py", line 546, in forward return self.forward_once(x) File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/models/models.py", line 607, in forward_once x = module(x) File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl result = self.forward(*input, **kwargs) File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/container.py", line 119, in forward input = module(input) File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl result = self.forward(*input, **kwargs) File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 399, in forward return self._conv_forward(input, self.weight, self.bias) File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 396, in _conv_forward self.padding, self.dilation, self.groups) RuntimeError: cuDNN error: CUDNN_STATUS_BAD_PARAM You can try to repro this exception using the following code snippet. If that doesn't trigger the error, please include your original repro script when reporting this issue.
# Standalone repro script emitted with the cuDNN error above: it runs the
# forward and backward pass of a single 1x1 half-precision Conv2d
# (128 -> 64 channels) on an 8x128x160x160 input. Requires a CUDA device.
import torch

# Backend flags copied verbatim from the failing run.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True

data = torch.randn([8, 128, 160, 160], dtype=torch.half, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(128, 64, kernel_size=[1, 1], padding=[0, 0], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().half()

out = net(data)
out.backward(torch.randn_like(out))
# Wait for queued GPU work so an asynchronous CUDA error is raised here.
torch.cuda.synchronize()
ConvolutionParams data_type = CUDNN_DATA_HALF padding = [0, 0, 0] stride = [1, 1, 0] dilation = [1, 1, 0] groups = 1 deterministic = false allow_tf32 = true input: TensorDescriptor 0x7ff3700ed890 type = CUDNN_DATA_HALF nbDims = 4 dimA = 8, 128, 160, 160, strideA = 3276800, 25600, 160, 1, output: TensorDescriptor 0x7ff3700ecec0 type = CUDNN_DATA_HALF nbDims = 4 dimA = 8, 64, 160, 160, strideA = 1638400, 25600, 160, 1, weight: FilterDescriptor 0x7ff370126320 type = CUDNN_DATA_HALF tensor_format = CUDNN_TENSOR_NCHW nbDims = 4 dimA = 64, 128, 1, 1, Pointer addresses: input: 0x7ff2f3a00000 output: 0x7ff399200000 weight: 0x7ff41e9efc00 Forward algorithm: 1
File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/train.py", line 296, in train pred = model(imgs) # forward File "/workspaces/python/DeepDataPrepartionNTrainingFramework/ObjectDetection/yolor/train.py", line 546, in <module>
train(hyp, opt, device, tb_writer, wandb)