pytorch / vision

Datasets, Transforms and Models specific to Computer Vision
https://pytorch.org/vision
BSD 3-Clause "New" or "Revised" License

MPS Issue : view size is not compatible with input tensor's size and stride #8706

Open shanalikhan opened 3 days ago

shanalikhan commented 3 days ago

🐛 Describe the bug

Hi, I'm trying to train Faster R-CNN on a COCO-format dataset of my own images (image size is 512x512). I've tested the dataloader separately and it works: it prints the batch images and bounding-box details. I've also printed the loss inside the network, and it does print the batch mean; the error occurs right after that.

This is the code

# imports inferred from the usage below (not shown in the original snippet)
import torch
from torch import optim
from torch.utils.data import DataLoader
import lightning as L
from torchvision import datasets, models, ops
from torchvision.transforms import v2

img_process = v2.Compose(
    [
        v2.ToTensor(),
        v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ]
)

class SCocoDetection(datasets.CocoDetection):
    def __init__(
        self,
        image_directory_path: str,
        annotation_path : str,
        train: bool = True,
        image_processor = None
    ):
        super().__init__(image_directory_path, annotation_path)
        self.image_processor = image_processor

    def __getitem__(self, idx):

        image, annotations = super().__getitem__(idx)
        images, targets = [], []
        image_id = self.ids[idx]

        for ann in annotations:
            bbox = ann['bbox']  # COCO boxes are [x, y, width, height]

            # flag very small boxes (area <= 0.1% of a 512x512 image); currently unused
            small = (bbox[2] * bbox[3]) <= (512 * 512 * 0.001)

            bbox = torch.tensor(bbox).unsqueeze(0).float()
            # convert to the xyxy format expected by torchvision detection models
            boxes = ops.box_convert(bbox, in_fmt='xywh', out_fmt='xyxy')
            #boxes = None
            #if not small:
            #    boxes = ops.box_convert(bbox, in_fmt='xywh', out_fmt='xyxy')
            #else:
            #    boxes = bbox
            boxes = boxes.float()
            output_dict = self.image_processor({"image": image, "boxes": boxes})
            images.append(output_dict['image'])
            targets.append({
                'boxes': output_dict['boxes'],
                'labels': torch.ones(len(boxes), dtype=int)
            })

        return images, targets

TRAIN_DATASET = SCocoDetection(
    image_directory_path='047/v2_coco_train/images',
    annotation_path='047/v2_coco_train/result.json',
    image_processor=img_process,
    train=True)
VAL_DATASET = SCocoDetection(
    image_directory_path='047/v2_coco_test/images',
    annotation_path= '047/v2_coco_test/result.json',
    image_processor=img_process,
    train=False)

print("Number of training examples:", len(TRAIN_DATASET))
print("Number of validation examples:", len(VAL_DATASET))
#print("Number of test examples:", len(TEST_DATASET))

def collate_fn(batch):
    # keep images and targets as tuples of per-sample lists instead of stacking them
    return tuple(zip(*batch))

TRAIN_DATALOADER = DataLoader(dataset=TRAIN_DATASET, collate_fn=collate_fn, batch_size=2, shuffle=True)
VAL_DATALOADER = DataLoader(dataset=VAL_DATASET, collate_fn=collate_fn, batch_size=4, shuffle=True)
import numpy as np

class CocoDNN(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights="DEFAULT")

    def forward(self, images, targets=None):
        return self.model(images, targets)

    def training_step(self, batch, batch_idx):
        imgs, annot = batch

        batch_losses = []
        for img_b, annot_b in zip(imgs, annot):
            #print(len(img_b), len(annot_b))
            loss_dict = self.model(img_b, annot_b)
            losses = sum(loss for loss in loss_dict.values())
            #print(losses)
            batch_losses.append(losses)

        batch_mean = torch.mean(torch.stack(batch_losses))
        #print(batch_mean)  # tried this and it prints
        self.log('train_loss', batch_mean)

        #print(imgs[0])
        #print(' ----',annot)
        #loss_dict = self.model(img_b, annot_b)
        #losses = sum(loss for loss in loss_dict.values())
        #self.log('train_loss', losses)
        return batch_mean

    def configure_optimizers(self):
        return optim.SGD(self.parameters(), lr=0.001, momentum=0.9, weight_decay=0.0005)

dnn = CocoDNN()
trainer = L.Trainer(limit_train_batches=100, max_epochs=1)
trainer.fit(model=dnn, train_dataloaders=TRAIN_DATALOADER)

Stack Trace


{
    "name": "RuntimeError",
    "message": "view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.",
    "stack": "---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[192], line 3
      1 dnn = CocoDNN()
      2 trainer = L.Trainer(limit_train_batches=100, max_epochs=1)
----> 3 trainer.fit(model=dnn, train_dataloaders=TRAIN_DATALOADER)

File site-packages/lightning/pytorch/trainer/trainer.py:538, in Trainer.fit(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    536 self.state.status = TrainerStatus.RUNNING
    537 self.training = True
--> 538 call._call_and_handle_interrupt(
    539     self, self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
    540 )

File site-packages/lightning/pytorch/trainer/call.py:47, in _call_and_handle_interrupt(trainer, trainer_fn, *args, **kwargs)
     45     if trainer.strategy.launcher is not None:
     46         return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
---> 47     return trainer_fn(*args, **kwargs)
     49 except _TunerExitException:
     50     _call_teardown_hook(trainer)

File site-packages/lightning/pytorch/trainer/trainer.py:574, in Trainer._fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
    567 assert self.state.fn is not None
    568 ckpt_path = self._checkpoint_connector._select_ckpt_path(
    569     self.state.fn,
    570     ckpt_path,
    571     model_provided=True,
    572     model_connected=self.lightning_module is not None,
    573 )
--> 574 self._run(model, ckpt_path=ckpt_path)
    576 assert self.state.stopped
    577 self.training = False

File site-packages/lightning/pytorch/trainer/trainer.py:981, in Trainer._run(self, model, ckpt_path)
    976 self._signal_connector.register_signal_handlers()
    978 # ----------------------------
    979 # RUN THE TRAINER
    980 # ----------------------------
--> 981 results = self._run_stage()
    983 # ----------------------------
    984 # POST-Training CLEAN UP
    985 # ----------------------------
    986 log.debug(f\"{self.__class__.__name__}: trainer tearing down\")

File site-packages/lightning/pytorch/trainer/trainer.py:1025, in Trainer._run_stage(self)
   1023         self._run_sanity_check()
   1024     with torch.autograd.set_detect_anomaly(self._detect_anomaly):
-> 1025         self.fit_loop.run()
   1026     return None
   1027 raise RuntimeError(f\"Unexpected state {self.state}\")

File site-packages/lightning/pytorch/loops/fit_loop.py:205, in _FitLoop.run(self)
    203 try:
    204     self.on_advance_start()
--> 205     self.advance()
    206     self.on_advance_end()
    207     self._restarting = False

File site-packages/lightning/pytorch/loops/fit_loop.py:363, in _FitLoop.advance(self)
    361 with self.trainer.profiler.profile(\"run_training_epoch\"):
    362     assert self._data_fetcher is not None
--> 363     self.epoch_loop.run(self._data_fetcher)

File site-packages/lightning/pytorch/loops/training_epoch_loop.py:140, in _TrainingEpochLoop.run(self, data_fetcher)
    138 while not self.done:
    139     try:
--> 140         self.advance(data_fetcher)
    141         self.on_advance_end(data_fetcher)
    142         self._restarting = False

File site-packages/lightning/pytorch/loops/training_epoch_loop.py:250, in _TrainingEpochLoop.advance(self, data_fetcher)
    247 with trainer.profiler.profile(\"run_training_batch\"):
    248     if trainer.lightning_module.automatic_optimization:
    249         # in automatic optimization, there can only be one optimizer
--> 250         batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
    251     else:
    252         batch_output = self.manual_optimization.run(kwargs)

File site-packages/lightning/pytorch/loops/optimization/automatic.py:190, in _AutomaticOptimization.run(self, optimizer, batch_idx, kwargs)
    183         closure()
    185 # ------------------------------
    186 # BACKWARD PASS
    187 # ------------------------------
    188 # gradient update with accumulated gradients
    189 else:
--> 190     self._optimizer_step(batch_idx, closure)
    192 result = closure.consume_result()
    193 if result.loss is None:

File site-packages/lightning/pytorch/loops/optimization/automatic.py:268, in _AutomaticOptimization._optimizer_step(self, batch_idx, train_step_and_backward_closure)
    265     self.optim_progress.optimizer.step.increment_ready()
    267 # model hook
--> 268 call._call_lightning_module_hook(
    269     trainer,
    270     \"optimizer_step\",
    271     trainer.current_epoch,
    272     batch_idx,
    273     optimizer,
    274     train_step_and_backward_closure,
    275 )
    277 if not should_accumulate:
    278     self.optim_progress.optimizer.step.increment_completed()

File site-packages/lightning/pytorch/trainer/call.py:167, in _call_lightning_module_hook(trainer, hook_name, pl_module, *args, **kwargs)
    164 pl_module._current_fx_name = hook_name
    166 with trainer.profiler.profile(f\"[LightningModule]{pl_module.__class__.__name__}.{hook_name}\"):
--> 167     output = fn(*args, **kwargs)
    169 # restore current_fx when nested context
    170 pl_module._current_fx_name = prev_fx_name

File site-packages/lightning/pytorch/core/module.py:1306, in LightningModule.optimizer_step(self, epoch, batch_idx, optimizer, optimizer_closure)
   1275 def optimizer_step(
   1276     self,
   1277     epoch: int,
   (...)
   1280     optimizer_closure: Optional[Callable[[], Any]] = None,
   1281 ) -> None:
   1282     r\"\"\"Override this method to adjust the default way the :class:`~lightning.pytorch.trainer.trainer.Trainer` calls
   1283     the optimizer.
   1284 
   (...)
   1304 
   1305     \"\"\"
-> 1306     optimizer.step(closure=optimizer_closure)

File site-packages/lightning/pytorch/core/optimizer.py:153, in LightningOptimizer.step(self, closure, **kwargs)
    150     raise MisconfigurationException(\"When `optimizer.step(closure)` is called, the closure should be callable\")
    152 assert self._strategy is not None
--> 153 step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
    155 self._on_after_step()
    157 return step_output

File site-packages/lightning/pytorch/strategies/strategy.py:238, in Strategy.optimizer_step(self, optimizer, closure, model, **kwargs)
    236 # TODO(fabric): remove assertion once strategy's optimizer_step typing is fixed
    237 assert isinstance(model, pl.LightningModule)
--> 238 return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)

File site-packages/lightning/pytorch/plugins/precision/precision.py:122, in Precision.optimizer_step(self, optimizer, model, closure, **kwargs)
    120 \"\"\"Hook to run the optimizer step.\"\"\"
    121 closure = partial(self._wrap_closure, model, optimizer, closure)
--> 122 return optimizer.step(closure=closure, **kwargs)

File site-packages/torch/optim/optimizer.py:487, in Optimizer.profile_hook_step.<locals>.wrapper(*args, **kwargs)
    482         else:
    483             raise RuntimeError(
    484                 f\"{func} must return None or a tuple of (new_args, new_kwargs), but got {result}.\"
    485             )
--> 487 out = func(*args, **kwargs)
    488 self._optimizer_step_code()
    490 # call optimizer step post hooks

File site-packages/torch/optim/optimizer.py:91, in _use_grad_for_differentiable.<locals>._use_grad(self, *args, **kwargs)
     89     torch.set_grad_enabled(self.defaults[\"differentiable\"])
     90     torch._dynamo.graph_break()
---> 91     ret = func(self, *args, **kwargs)
     92 finally:
     93     torch._dynamo.graph_break()

File site-packages/torch/optim/sgd.py:112, in SGD.step(self, closure)
    110 if closure is not None:
    111     with torch.enable_grad():
--> 112         loss = closure()
    114 for group in self.param_groups:
    115     params: List[Tensor] = []

File site-packages/lightning/pytorch/plugins/precision/precision.py:108, in Precision._wrap_closure(self, model, optimizer, closure)
     95 def _wrap_closure(
     96     self,
     97     model: \"pl.LightningModule\",
     98     optimizer: Steppable,
     99     closure: Callable[[], Any],
    100 ) -> Any:
    101     \"\"\"This double-closure allows makes sure the ``closure`` is executed before the ``on_before_optimizer_step``
    102     hook is called.
    103 
   (...)
    106 
    107     \"\"\"
--> 108     closure_result = closure()
    109     self._after_closure(model, optimizer)
    110     return closure_result

File site-packages/lightning/pytorch/loops/optimization/automatic.py:144, in Closure.__call__(self, *args, **kwargs)
    142 @override
    143 def __call__(self, *args: Any, **kwargs: Any) -> Optional[Tensor]:
--> 144     self._result = self.closure(*args, **kwargs)
    145     return self._result.loss

File site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
    113 @functools.wraps(func)
    114 def decorate_context(*args, **kwargs):
    115     with ctx_factory():
--> 116         return func(*args, **kwargs)

File site-packages/lightning/pytorch/loops/optimization/automatic.py:138, in Closure.closure(self, *args, **kwargs)
    135     self._zero_grad_fn()
    137 if self._backward_fn is not None and step_output.closure_loss is not None:
--> 138     self._backward_fn(step_output.closure_loss)
    140 return step_output

File site-packages/lightning/pytorch/loops/optimization/automatic.py:239, in _AutomaticOptimization._make_backward_fn.<locals>.backward_fn(loss)
    238 def backward_fn(loss: Tensor) -> None:
--> 239     call._call_strategy_hook(self.trainer, \"backward\", loss, optimizer)

File site-packages/lightning/pytorch/trainer/call.py:319, in _call_strategy_hook(trainer, hook_name, *args, **kwargs)
    316     return None
    318 with trainer.profiler.profile(f\"[Strategy]{trainer.strategy.__class__.__name__}.{hook_name}\"):
--> 319     output = fn(*args, **kwargs)
    321 # restore current_fx when nested context
    322 pl_module._current_fx_name = prev_fx_name

File site-packages/lightning/pytorch/strategies/strategy.py:212, in Strategy.backward(self, closure_loss, optimizer, *args, **kwargs)
    209 assert self.lightning_module is not None
    210 closure_loss = self.precision_plugin.pre_backward(closure_loss, self.lightning_module)
--> 212 self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
    214 closure_loss = self.precision_plugin.post_backward(closure_loss, self.lightning_module)
    215 self.post_backward(closure_loss)

File site-packages/lightning/pytorch/plugins/precision/precision.py:72, in Precision.backward(self, tensor, model, optimizer, *args, **kwargs)
     52 @override
     53 def backward(  # type: ignore[override]
     54     self,
   (...)
     59     **kwargs: Any,
     60 ) -> None:
     61     r\"\"\"Performs the actual backpropagation.
     62 
     63     Args:
   (...)
     70 
     71     \"\"\"
---> 72     model.backward(tensor, *args, **kwargs)

File site-packages/lightning/pytorch/core/module.py:1101, in LightningModule.backward(self, loss, *args, **kwargs)
   1099     self._fabric.backward(loss, *args, **kwargs)
   1100 else:
-> 1101     loss.backward(*args, **kwargs)

File site-packages/torch/_tensor.py:581, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
    571 if has_torch_function_unary(self):
    572     return handle_torch_function(
    573         Tensor.backward,
    574         (self,),
   (...)
    579         inputs=inputs,
    580     )
--> 581 torch.autograd.backward(
    582     self, gradient, retain_graph, create_graph, inputs=inputs
    583 )

File site-packages/torch/autograd/__init__.py:347, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    342     retain_graph = create_graph
    344 # The reason we repeat the same comment below is that
    345 # some Python versions print out the first line of a multi-line function
    346 # calls in the traceback and some print out the last line
--> 347 _engine_run_backward(
    348     tensors,
    349     grad_tensors_,
    350     retain_graph,
    351     create_graph,
    352     inputs,
    353     allow_unreachable=True,
    354     accumulate_grad=True,
    355 )

File site-packages/torch/autograd/graph.py:825, in _engine_run_backward(t_outputs, *args, **kwargs)
    823     unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)
    824 try:
--> 825     return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
    826         t_outputs, *args, **kwargs
    827     )  # Calls into the C++ engine to run the backward pass
    828 finally:
    829     if attach_logging_hooks:

RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead."
}
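
For context, this is the generic RuntimeError PyTorch raises when .view() is called on a tensor whose strides cannot represent the requested shape (a non-contiguous tensor). A minimal standalone illustration, unrelated to MPS or to the model above:

import torch

x = torch.randn(2, 3, 4)
y = x.permute(0, 2, 1)   # permuting makes the tensor non-contiguous
try:
    y.view(2, 12)        # raises: view size is not compatible with input tensor's size and stride
except RuntimeError as e:
    print(e)
z = y.reshape(2, 12)     # reshape copies when necessary, so it succeeds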

Versions

Latest versions of torchvision and torch.

NicolasHug commented 3 days ago

Hi @shanalikhan, it's quite difficult for now to figure out whether the issue you're observing truly is related to torchvision. Are you able to share a much more minimal reproducing example?

shanalikhan commented 3 days ago

@NicolasHug Yes, I've shared the complete data loader and model in the issue above. Since it only fails during training, I had to share the complete code instead of a minimal example.

abhi-glitchhg commented 3 days ago

@shanalikhan if you could write a minimal example without using the Lightning code base, it would help. As you can see, the error log is quite convoluted.

shanalikhan commented 2 days ago

Converting it to run without the Lightning code base will take some time. For now, I've tried switching from MPS to CPU and it worked fine. Give me some time; I'll try Google Colab to determine whether or not the problem is specific to MPS.

https://github.com/pytorch/pytorch/issues/80800
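
For reference, a minimal Lightning-free reproduction along the lines being requested might look like the sketch below (synthetic image and box data; the box coordinates and label values are placeholders, not taken from my dataset):

import torch
from torchvision import models

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

model = models.detection.fasterrcnn_mobilenet_v3_large_fpn(weights="DEFAULT")
model.to(device).train()

images = [torch.rand(3, 512, 512, device=device)]
targets = [{
    "boxes": torch.tensor([[10.0, 10.0, 100.0, 100.0]], device=device),  # xyxy
    "labels": torch.ones(1, dtype=torch.int64, device=device),
}]

loss_dict = model(images, targets)          # training mode returns a dict of losses
loss = sum(v for v in loss_dict.values())
loss.backward()                             # the original failure happened during backward
print(loss.item())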

shanalikhan commented 19 hours ago

I've tried Google Colab and it works fine on both GPU and CPU; the issue persists only on MPS (I'm using an M3 at the moment).

https://github.com/Lightning-AI/pytorch-lightning/issues/20386
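
Until the root cause on the MPS backend is pinned down, a possible interim workaround, consistent with the observation that the same code runs fine on CPU, is to pin the Lightning Trainer to the CPU accelerator on Apple Silicon. A rough sketch, untested against this exact setup:

import torch
import lightning as L

# fall back to CPU when only MPS is available, since training reportedly works on CPU and CUDA
accelerator = "cpu" if torch.backends.mps.is_available() else "auto"
trainer = L.Trainer(limit_train_batches=100, max_epochs=1, accelerator=accelerator)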