I'm trying to train a segmentation model using DeepLabV3+ with a ResNet backbone, using fastai2 on distributed GPUs (2 GPUs, same machine).
However, when I launch the training process, I get the stack trace below:
File "train_bscnn.py", line 131, in run
with learn.distrib_ctx():learn.fit_one_cycle(epochs, slice_train, cbs=callbacks)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/callback/schedule.py", line 116, in fit_one_cycle
self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 221, in fit
self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 212, in _do_fit
self._with_events(self._do_epoch, 'epoch', CancelEpochException)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 206, in _do_epoch
self._do_epoch_train()
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 198, in _do_epoch_train
self._with_events(self.all_batches, 'train', CancelTrainException)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 169, in all_batches
for o in enumerate(self.dl): self.one_batch(*o)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 194, in one_batch
self._with_events(self._do_one_batch, 'batch', CancelBatchException)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _with_events
try: self(f'before_{event_type}'); f()
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 172, in _do_one_batch
self.pred = self.model(*self.xb)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 963, in forward
output = self.module(*inputs[0], **kwargs[0])
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/segmentation_models_pytorch/base/model.py", line 15, in forward
features = self.encoder(x)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/segmentation_models_pytorch/encoders/resnet.py", line 62, in forward
x = stages[i](x)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/container.py", line 141, in forward
input = module(input)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/activation.py", line 98, in forward
return F.relu(input, inplace=self.inplace)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/functional.py", line 1438, in relu
return handle_torch_function(relu, (input,), input, inplace=inplace)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/overrides.py", line 1394, in handle_torch_function
result = torch_func_method(public_api, types, args, kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/torch_core.py", line 341, in __torch_function__
res = super().__torch_function__(func, types, args=args, kwargs=kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/_tensor.py", line 1142, in __torch_function__
ret = func(*args, **kwargs)
File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/functional.py", line 1440, in relu
result = torch.relu(input)
RuntimeError: Output 0 of SyncBatchNormBackward is a view and is being modified inplace. This view was created inside a custom Function (or because an input was returned as-is) and the autograd logic to handle view+inplace would override the custom backward associated with the custom Function, leading to incorrect gradients. This behavior is forbidden. You can fix this by cloning the output of the custom Function.
I'm trying to train a segmentation model using DeepLabV3+ with a ResNet backbone, using fastai2 on distributed GPUs (2 GPUs, same machine).
However, when I launch the training process, I get the stack trace below:
File "train_bscnn.py", line 131, in run with learn.distrib_ctx():learn.fit_one_cycle(epochs, slice_train, cbs=callbacks) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/callback/schedule.py", line 116, in fit_one_cycle self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 221, in fit self._with_events(self._do_fit, 'fit', CancelFitException, self._end_cleanup) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _withevents try: self(f'before{event_type}'); f() File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 212, in _do_fit self._with_events(self._do_epoch, 'epoch', CancelEpochException) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _withevents try: self(f'before{event_type}'); f() File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 206, in _do_epoch self._do_epoch_train() File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 198, in _do_epoch_train self._with_events(self.all_batches, 'train', CancelTrainException) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _withevents try: self(f'before{event_type}'); f() File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 169, in all_batches for o in enumerate(self.dl): self.one_batch(o) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 194, in one_batch self._with_events(self._do_one_batch, 'batch', CancelBatchException) File 
"/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 163, in _withevents try: self(f'before{event_type}'); f() File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/learner.py", line 172, in _do_one_batch self.pred = self.model(self.xb) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl return forward_call(*input, kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 963, in forward output = self.module(*inputs[0], *kwargs[0]) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl return forward_call(input, kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/segmentation_models_pytorch/base/model.py", line 15, in forward features = self.encoder(x) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl return forward_call(*input, kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/segmentation_models_pytorch/encoders/resnet.py", line 62, in forward x = stagesi File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl return forward_call(*input, *kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/container.py", line 141, in forward input = module(input) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl return forward_call(input, kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/modules/activation.py", line 98, in forward return 
F.relu(input, inplace=self.inplace) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/nn/functional.py", line 1438, in relu return handle_torch_function(relu, (input,), input, inplace=inplace) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/overrides.py", line 1394, in handle_torch_function result = torch_func_method(public_api, types, args, kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/fastai/torch_core.py", line 341, in torch_function res = super().torch_function(func, types, args=args, kwargs=kwargs) File "/home/tiago/anaconda3/envs/fastai2_segmentation/lib/python3.8/site-packages/torch/_tensor.py", line 1142, in __torch_function__ ret = func(*args, **kwargs) File "/home/tiago/anaconda3/envs/fastai2segmentation/lib/python3.8/site-packages/torch/nn/functional.py", line 1440, in relu result = torch.relu(input) RuntimeError: Output 0 of SyncBatchNormBackward is a view and is being modified inplace. This view was created inside a custom Function (or because an input was returned as-is) and the autograd logic to handle view+inplace would override the custom backward associated with the custom Function, leading to incorrect gradients. This behavior is forbidden. You can fix this by cloning the output of the custom Function.
Any thoughts?