Lightning-AI / pytorch-lightning


Finetuning callback crashes with DeepSpeed #9833

Closed SeanNaren closed 1 year ago

SeanNaren commented 3 years ago

πŸ› Bug

Port from https://github.com/microsoft/DeepSpeed/issues/1426

The BaseFinetuning callback from PyTorch Lightning crashes when using the DeepSpeed plugin.

To Reproduce

import os

import torch
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader, Dataset

from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import BaseFinetuning
from pytorch_lightning.plugins import DeepSpeedPlugin

class FreezeBase(BaseFinetuning):
    def freeze_before_training(self, pl_module: LightningModule) -> None:
        self.freeze(pl_module.layer1)

    def finetune_function(
            self,
            pl_module: LightningModule,
            epoch: int,
            optimizer: Optimizer,
            opt_idx: int,
    ) -> None:
        pass

class TestDataset(Dataset):
    def __init__(self, size, length):
        self.len = length
        self.data = torch.randn(length, size)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len

class FinetuningModel(LightningModule):
    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Linear(32, 32)
        self.layer2 = torch.nn.Linear(32, 2)

    def forward(self, x):
        return self.layer2(self.layer1(x))

    def training_step(self, batch, batch_idx):
        loss = self(batch).sum()
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        loss = self(batch).sum()

    def test_step(self, batch, batch_idx):
        loss = self(batch).sum()

    def configure_optimizers(self):
        return torch.optim.Adam(filter(lambda p: p.requires_grad, self.parameters()), lr=0.1)

def run():
    train_data = DataLoader(TestDataset(32, 64), batch_size=2)
    val_data = DataLoader(TestDataset(32, 64), batch_size=2)
    test_data = DataLoader(TestDataset(32, 64), batch_size=2)

    model = FinetuningModel()
    trainer = Trainer(
        default_root_dir=os.getcwd(),
        fast_dev_run=True,
        plugins=DeepSpeedPlugin(stage=2),  # ZeRO stage 2: partitions optimizer states and gradients
        max_epochs=1,
        gpus=1,
        callbacks=[FreezeBase()]
    )
    trainer.fit(model, train_dataloaders=train_data, val_dataloaders=val_data)
    trainer.test(model, dataloaders=test_data)

if __name__ == "__main__":
    run()
tchaton commented 3 years ago

Hey @SeanNaren,

Do you think we should re-create the optimizers for DeepSpeed?

Best, T.C

je-santos commented 3 years ago

Hi,

This might not be directly related, but when I try to run the example above, I get:

You have not specified an optimizer or scheduler within the DeepSpeed config. Using `configure_optimizers` to define optimizer and scheduler. Using /home/My_NAME/.cache/torch_extensions as PyTorch extensions root...

File "/home/My_NAME/.local/lib/python3.8/site-packages/torch/utils/cpp_extension.py", line 1430, in verify_ninja_availability raise RuntimeError("Ninja is required to load C++ extensions") RuntimeError: Ninja is required to load C++ extensions `

Am I missing something?

Many thanks @SeanNaren

Evm7 commented 2 years ago

Hi,

I am facing the same issue while trying to freeze some backbone layers for a multi-head model. Has anyone been able to solve the problem?

ar90n commented 2 years ago

Hi there. I investigated this issue and found that the root cause is that DeepSpeed flattens the model's weight tensors inside the optimizer. Unflattening them is therefore one way to fix it. The following modification is an example that uses the `unflatten` method DeepSpeed adds to the optimizer.

diff --git a/pytorch_lightning/callbacks/finetuning.py b/pytorch_lightning/callbacks/finetuning.py
index 26ef742ee..2fa3a7880 100644
--- a/pytorch_lightning/callbacks/finetuning.py
+++ b/pytorch_lightning/callbacks/finetuning.py
@@ -258,12 +258,11 @@ class BaseFinetuning(Callback):

     def _store(
         self,
-        pl_module: "pl.LightningModule",
+        mapping: dict,
         opt_idx: int,
         num_param_groups: int,
         current_param_groups: List[Dict[str, Any]],
     ) -> None:
-        mapping = {p: n for n, p in pl_module.named_parameters()}
         if opt_idx not in self._internal_optimizer_metadata:
             self._internal_optimizer_metadata[opt_idx] = self._apply_mapping_to_param_groups(
                 current_param_groups, mapping
@@ -283,7 +282,22 @@ class BaseFinetuning(Callback):
             num_param_groups = len(optimizer.param_groups)
             self.finetune_function(pl_module, trainer.current_epoch, optimizer, opt_idx)
             current_param_groups = optimizer.param_groups
-            self._store(pl_module, opt_idx, num_param_groups, current_param_groups)
+            mapping = {p: n for n, p in pl_module.named_parameters()}
+
+            # DeepSpeed flattens the optimizer's tensors and attaches an `unflatten` method to the optimizer.
+            if len(current_param_groups[0]["params"]) == 1 and hasattr(optimizer, "unflatten"):
+                current_param_groups = [
+                    {
+                        "params": [
+                            tuple(p.flatten().tolist())
+                            for p in optimizer.unflatten(
+                                current_param_groups[0]["params"][0], optimizer.round_robin_bit16_groups[0]
+                            )
+                        ]
+                    }
+                ]
+                mapping = {tuple(p.flatten().tolist()): n for p, n in mapping.items()}
+            self._store(mapping, opt_idx, num_param_groups, current_param_groups)

With this modification applied, the repro script runs to completion:

/workspaces/pytorch/pytorch-lightning/pytorch_lightning/plugins/training_type/deepspeed.py:20: LightningDeprecationWarning: The `pl.plugins.training_type.deepspeed.DeepSpeedPlugin` is deprecated in v1.6 and will be removed in v1.8. Use `pl.strategies.deepspeed.DeepSpeedStrategy` instead.
  rank_zero_deprecation(
/workspaces/pytorch/pytorch-lightning/pytorch_lightning/trainer/connectors/accelerator_connector.py:424: LightningDeprecationWarning: Setting `Trainer(gpus=1)` is deprecated in v1.7 and will be removed in v2.0. Please use `Trainer(accelerator='gpu', devices=1)` instead.
  rank_zero_deprecation(
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in fast_dev_run mode: will run the requested loop using 1 batch(es).
`Trainer(limit_train_batches=1)` was configured so 1 batch per epoch will be used.
`Trainer(limit_val_batches=1)` was configured so 1 batch will be used.
`Trainer(limit_test_batches=1)` was configured so 1 batch will be used.
`Trainer(limit_predict_batches=1)` was configured so 1 batch will be used.
`Trainer(val_check_interval=1.0)` was configured so validation will run at the end of the training epoch..
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Using /home/vscode/.cache/torch_extensions/py38_cu113 as PyTorch extensions root...
Emitting ninja build file /home/vscode/.cache/torch_extensions/py38_cu113/utils/build.ninja...
Building extension module utils...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module utils...
Time to load utils op: 0.408236026763916 seconds
Rank: 0 partition count [1] and sizes[(66, False)] 
Using /home/vscode/.cache/torch_extensions/py38_cu113 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.0002238750457763672 seconds

  | Name   | Type   | Params
----------------------------------
0 | layer1 | Linear | 1.1 K 
1 | layer2 | Linear | 66    
----------------------------------
66        Trainable params
1.1 K     Non-trainable params
1.1 K     Total params
0.004     Total estimated model params size (MB)
Epoch 0: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 2/2 [00:00<00:00, 160.42it/s, loss=0.00583, v_num=]
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/1                                                                                                    
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
You have not specified an optimizer or scheduler within the DeepSpeed config. Using `configure_optimizers` to define optimizer and scheduler.
Using /home/vscode/.cache/torch_extensions/py38_cu113 as PyTorch extensions root...
No modifications detected for re-loaded extension module utils, skipping build step...
Loading extension module utils...
Time to load utils op: 0.00019049644470214844 seconds
Testing DataLoader 0: 100%|β–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆβ–ˆ| 1/1 [00:00<00:00, 1256.16it/s]
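
To make the failure mode concrete, here is a minimal sketch (plain PyTorch, no DeepSpeed; the variable names are illustrative, and the flattening below only simulates what ZeRO stage 2 does internally) of why the parameter-to-name mapping built in `_store` stops working once the optimizer's param group holds a single flattened buffer:

import torch

model = torch.nn.Sequential(torch.nn.Linear(32, 32), torch.nn.Linear(32, 2))

# What BaseFinetuning._store builds: a parameter-object -> name mapping.
mapping = {p: n for n, p in model.named_parameters()}

# With a plain optimizer, every entry in param_groups is one of the model's
# parameters, so every lookup in the mapping succeeds.
plain_group = {"params": list(model.parameters())}
print(all(p in mapping for p in plain_group["params"]))  # True

# ZeRO stage 2 replaces the group's params with a single flat buffer
# (simulated here with torch.cat). That buffer is not a key in `mapping`,
# so looking it up inside the callback fails.
flat = torch.cat([p.detach().flatten() for p in model.parameters()])
print(flat in mapping)  # False -> mapping[flat] would raise KeyError

This is also why the diff above converts the flat buffer back into per-parameter keys (via `optimizer.unflatten`) before calling `_store`.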

The workaround above is messy and only covers storing the callback's internal state, so we still need additional code to handle loading from a checkpoint correctly. In `BaseFinetuning`, `load_state_dict` and `on_fit_start` are expected to implement that: `load_state_dict` reads `_internal_optimizer_metadata` from the given `state_dict`, and `on_fit_start` applies it to the optimizer. However, this cannot work with DeepSpeed, because `DeepSpeedStrategy` always returns `True` from `restore_checkpoint_after_setup`, and as a result `on_fit_start` is called before `load_state_dict`.
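
For reference, here is a simplified sketch of the two hooks involved, paraphrased from the description above rather than copied from the Lightning source (the class name is made up):

class BaseFinetuningSketch:
    def __init__(self):
        self._internal_optimizer_metadata = {}
        self._restarting = False

    def load_state_dict(self, state_dict):
        # Runs while the checkpoint is being restored: remember the
        # param-group metadata that _store saved earlier.
        self._internal_optimizer_metadata = state_dict
        self._restarting = True

    def on_fit_start(self, trainer, pl_module):
        # Meant to run *after* load_state_dict: re-apply the stored metadata
        # to the freshly created optimizers. Because DeepSpeedStrategy returns
        # True from restore_checkpoint_after_setup, this hook currently fires
        # first, while _internal_optimizer_metadata is still empty.
        if self._restarting:
            ...  # re-add the stored param groups to trainer.optimizers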

I don't have a solution for this yet. Do you have any good ideas?

ar90n commented 2 years ago

To make `load_state_dict` run before `on_fit_start`, would it be possible to move the call to `self._restore_modules_and_callbacks(ckpt_path)` before `self._call_callback_hooks("on_fit_start")`, as in the following diff?

 pytorch_lightning/trainer/trainer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 71cb47b13..1e7c91838 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -1156,6 +1156,10 @@ class Trainer(
         # strategy will configure model and move it to the device
         self.strategy.setup(self)

+        if self.strategy.restore_checkpoint_after_setup:
+            log.detail(f"{self.__class__.__name__}: restoring module and callbacks from checkpoint path: {ckpt_path}")
+            self._restore_modules_and_callbacks(ckpt_path)
+
         # hook
         if self.state.fn == TrainerFn.FITTING:
             self._call_callback_hooks("on_fit_start")
@@ -1163,9 +1167,6 @@ class Trainer(

         self._log_hyperparams()

-        if self.strategy.restore_checkpoint_after_setup:
-            log.detail(f"{self.__class__.__name__}: restoring module and callbacks from checkpoint path: {ckpt_path}")
-            self._restore_modules_and_callbacks(ckpt_path)

         # restore optimizers, etc.
         log.detail(f"{self.__class__.__name__}: restoring training state")