Lightning-AI / pytorch-lightning

Pretrain, finetune ANY AI model of ANY size on multiple GPUs, TPUs with zero code changes.
https://lightning.ai
Apache License 2.0

TypeError: training_step_end() missing 1 required positional argument: 'batch_idx' #8302

Closed: etetteh closed this issue 3 years ago

etetteh commented 3 years ago

This is my first time using Lightning. Basically, I am trying to convert the following training loop into Lightning:

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

avg_loss = 0.
avg_output_std = 0.
for e in range(epochs):

    for (x0, x1), _, _ in dataloader_train_simsiam:

        # move images to the gpu
        x0 = x0.to(device)
        x1 = x1.to(device)

        # run the model on both transforms of the images
        # the output of the simsiam model is a y containing the predictions
        # and projections for each input x
        y0, y1 = model(x0, x1)

        # backpropagation
        loss = criterion(y0, y1)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        # calculate the per-dimension standard deviation of the outputs
        # we can use this later to check whether the embeddings are collapsing
        output, _ = y0
        output = output.detach()
        output = torch.nn.functional.normalize(output, dim=1)

        output_std = torch.std(output, 0)
        output_std = output_std.mean()

        # use moving averages to track the loss and standard deviation
        w = 0.9
        avg_loss = w * avg_loss + (1 - w) * loss.item()
        avg_output_std = w * avg_output_std + (1 - w) * output_std.item()

    # the level of collapse is large if the standard deviation of the l2
    # normalized output is much smaller than 1 / sqrt(dim)
    collapse_level = max(0., 1 - math.sqrt(out_dim) * avg_output_std)
    # print intermediate results
    print(f'[Epoch {e:3d}] '
        f'Loss = {avg_loss:.2f} | '
        f'Collapse Level: {collapse_level:.2f} / 1.00')

What I have done so far is this:

class SimSiamModel(pl.LightningModule):
    def __init__(self, backbone, num_ftrs, pred_hidden_dim, out_dim, num_mlp_layers):
        super().__init__()

        # create a SimSiam model based on the ResNet backbone
        self.resnet_simsiam = \
            lightly.models.SimSiam(
                            backbone,
                            num_ftrs=num_ftrs,
                            proj_hidden_dim=pred_hidden_dim,
                            pred_hidden_dim=pred_hidden_dim,
                            out_dim=out_dim,
                            num_mlp_layers=num_mlp_layers
                        )

        # create our loss with the optional memory bank
        self.criterion = lightly.loss.SymNegCosineSimilarityLoss()

    def forward(self, x):
        return self.resnet_simsiam(x)

    def training_step(self, batch, batch_idx):
        (x0, x1), _, _ = batch
        y0, y1 = self.resnet_simsiam(x0, x1)
        loss = self.criterion(y0, y1)
        self.log('train_loss_ss', loss)
        return loss, y0

    def training_step_end(self, batch, batch_idx):
        loss, y0 = self.training_step(self, batch, batch_idx)
        output, _ = y0
        output = output.detach()
        output = torch.nn.functional.normalize(output, dim=1)

        output_std = torch.std(output, 0)
        output_std = output_std.mean()

        # use moving averages to track the loss and standard deviation
        w = 0.9
        avg_loss = 0.
        avg_output_std = 0.
        avg_loss = w * avg_loss + (1 - w) * loss.item()
        avg_output_std = w * avg_output_std + (1 - w) * output_std.item()

        return avg_loss, avg_output_std

    def training_epoch_end(self, batch, batch_idx, ):
        avg_loss, avg_output_std = self.training_step_end(self, batch, batch_idx)
        collapse_level = max(0., 1 - math.sqrt(out_dim) * avg_output_std)

        self.log('loss', round(avg_loss,2), prog_bar=True)
        self.log('Collapse Level', round(collapse_level, 2), prog_bar=True)

    def configure_optimizers(self):
        lr = 0.05 * batch_size / 256
        optimizer = AdamP(self.resnet_simsiam.parameters(), lr=lr,
                                weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, epochs)
        return [optimizer], [scheduler]

When I run the code, I get the error:

TypeError                                 Traceback (most recent call last)
<ipython-input-16-eb464d96d0bd> in <module>
     18     trainer.fit(
     19         model,
---> 20         train_loader_simsiam
     21         )

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloader, val_dataloaders, datamodule)
    458         )
    459 
--> 460         self._run(model)
    461 
    462         assert self.state.stopped

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in _run(self, model)
    756 
    757         # dispatch `start_training` or `start_evaluating` or `start_predicting`
--> 758         self.dispatch()
    759 
    760         # plugin will finalized fitting (e.g. ddp_spawn will load trained model)

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in dispatch(self)
    797             self.accelerator.start_predicting(self)
    798         else:
--> 799             self.accelerator.start_training(self)
    800 
    801     def run_stage(self):

~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in start_training(self, trainer)
     94 
     95     def start_training(self, trainer: 'pl.Trainer') -> None:
---> 96         self.training_type_plugin.start_training(trainer)
     97 
     98     def start_evaluating(self, trainer: 'pl.Trainer') -> None:

~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
    142     def start_training(self, trainer: 'pl.Trainer') -> None:
    143         # double dispatch to initiate the training loop
--> 144         self._results = trainer.run_stage()
    145 
    146     def start_evaluating(self, trainer: 'pl.Trainer') -> None:

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
    807         if self.predicting:
    808             return self.run_predict()
--> 809         return self.run_train()
    810 
    811     def _pre_training_routine(self):

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in run_train(self)
    869                 with self.profiler.profile("run_training_epoch"):
    870                     # run train epoch
--> 871                     self.train_loop.run_training_epoch()
    872 
    873                 if self.max_steps and self.max_steps <= self.global_step:

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_epoch(self)
    497             # ------------------------------------
    498             with self.trainer.profiler.profile("run_training_batch"):
--> 499                 batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
    500 
    501             # when returning -1 from train_step, we end epoch early

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py in run_training_batch(self, batch, batch_idx, dataloader_idx)
    736 
    737                         # optimizer step
--> 738                         self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    739                         if len(self.trainer.optimizers) > 1:
    740                             # revert back to previous state

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py in optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
    440             on_tpu=self.trainer._device_type == DeviceType.TPU and _TPU_AVAILABLE,
    441             using_native_amp=using_native_amp,
--> 442             using_lbfgs=is_lbfgs,
    443         )
    444 

~/.local/lib/python3.6/site-packages/pytorch_lightning/core/lightning.py in optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, optimizer_closure, on_tpu, using_native_amp, using_lbfgs)
   1401 
   1402         """
-> 1403         optimizer.step(closure=optimizer_closure)
   1404 
   1405     def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int):

~/.local/lib/python3.6/site-packages/pytorch_lightning/core/optimizer.py in step(self, closure, *args, **kwargs)
    212             profiler_name = f"optimizer_step_and_closure_{self._optimizer_idx}"
    213 
--> 214         self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
    215         self._total_optimizer_step_calls += 1
    216 

~/.local/lib/python3.6/site-packages/pytorch_lightning/core/optimizer.py in __optimizer_step(self, closure, profiler_name, **kwargs)
    132 
    133         with trainer.profiler.profile(profiler_name):
--> 134             trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
    135 
    136     def step(self, *args, closure: Optional[Callable] = None, **kwargs):

~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in optimizer_step(self, optimizer, opt_idx, lambda_closure, **kwargs)
    327         )
    328         if make_optimizer_step:
--> 329             self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
    330         self.precision_plugin.post_optimizer_step(optimizer, opt_idx)
    331         self.training_type_plugin.post_optimizer_step(optimizer, opt_idx, **kwargs)

~/.local/lib/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py in run_optimizer_step(self, optimizer, optimizer_idx, lambda_closure, **kwargs)
    334         self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any
    335     ) -> None:
--> 336         self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
    337 
    338     def optimizer_zero_grad(self, current_epoch: int, batch_idx: int, optimizer: Optimizer, opt_idx: int) -> None:

~/.local/lib/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in optimizer_step(self, optimizer, lambda_closure, **kwargs)
    191 
    192     def optimizer_step(self, optimizer: torch.optim.Optimizer, lambda_closure: Callable, **kwargs):
--> 193         optimizer.step(closure=lambda_closure, **kwargs)
    194 
    195     @property

~/.local/lib/python3.6/site-packages/torch/optim/lr_scheduler.py in wrapper(*args, **kwargs)
     63                 instance._step_count += 1
     64                 wrapped = func.__get__(instance, cls)
---> 65                 return wrapped(*args, **kwargs)
     66 
     67             # Note that the returned function here is no longer a bound method,

~/.local/lib/python3.6/site-packages/torch/optim/optimizer.py in wrapper(*args, **kwargs)
     86                 profile_name = "Optimizer.step#{}.step".format(obj.__class__.__name__)
     87                 with torch.autograd.profiler.record_function(profile_name):
---> 88                     return func(*args, **kwargs)
     89             return wrapper
     90 

~/.local/lib/python3.6/site-packages/timm/optim/adamp.py in step(self, closure)
     56         loss = None
     57         if closure is not None:
---> 58             loss = closure()
     59 
     60         for group in self.param_groups:

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py in train_step_and_backward_closure()
    731                         def train_step_and_backward_closure():
    732                             result = self.training_step_and_backward(
--> 733                                 split_batch, batch_idx, opt_idx, optimizer, self.trainer.hiddens
    734                             )
    735                             return None if result is None else result.loss

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py in training_step_and_backward(self, split_batch, batch_idx, opt_idx, optimizer, hiddens)
    821         with self.trainer.profiler.profile("training_step_and_backward"):
    822             # lightning module hook
--> 823             result = self.training_step(split_batch, batch_idx, opt_idx, hiddens)
    824             self._curr_step_result = result
    825 

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/training_loop.py in training_step(self, split_batch, batch_idx, opt_idx, hiddens)
    293             self.trainer.logger_connector.cache_logged_metrics()
    294 
--> 295             training_step_output = self.trainer.call_hook("training_step_end", training_step_output)
    296 
    297             self._check_training_step_output(training_step_output)

~/.local/lib/python3.6/site-packages/pytorch_lightning/trainer/trainer.py in call_hook(self, hook_name, *args, **kwargs)
   1233             if is_overridden(hook_name, model_ref):
   1234                 hook_fx = getattr(model_ref, hook_name)
-> 1235                 output = hook_fx(*args, **kwargs)
   1236 
   1237             # if the PL module doesn't have the hook then call the accelerator

TypeError: training_step_end() missing 1 required positional argument: 'batch_idx'
tchaton commented 3 years ago

Dear @etetteh,

Here is the correct signature for training_step_end: it only takes the outputs returned by training_step, so you can do further processing on them. batch_idx shouldn't be there.

    def training_step_end(self, outputs):
        # only used when running on DP: `outputs` holds the partial results from each GPU
        outputs = torch.cat(outputs, dim=1)
        softmax = torch.softmax(outputs, dim=1)
        out = softmax.mean()
        return out
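
The same applies to training_epoch_end: it receives the list of outputs collected from every training step in the epoch, not (batch, batch_idx). A minimal sketch, assuming each training_step returns a dict containing a 'loss' key:

    def training_epoch_end(self, outputs):
        # `outputs` is a list with one entry per training step of the epoch
        epoch_loss = torch.stack([o['loss'] for o in outputs]).mean()
        self.log('epoch_loss', epoch_loss, prog_bar=True)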

A small piece of advice: when implementing a model with Lightning, use your IDE's auto-completion. It will automatically insert the correct hook signatures.

Best, T.C

etetteh commented 3 years ago

Thanks for the advice on the IDE. The batch_idx issue is resolved, but training_step_end is still not doing what I expect. My goal is to get the loss and y0 from training_step and then perform the remaining operations in training_step_end.
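
Roughly, this is the pattern I am after (an untested sketch; it assumes self.avg_loss and self.avg_output_std are initialised to 0.0 in __init__):

    def training_step(self, batch, batch_idx):
        (x0, x1), _, _ = batch
        y0, y1 = self.resnet_simsiam(x0, x1)
        loss = self.criterion(y0, y1)
        self.log('train_loss_ss', loss)
        # return everything training_step_end needs; the 'loss' key is required
        return {'loss': loss, 'y0': y0}

    def training_step_end(self, step_output):
        loss, y0 = step_output['loss'], step_output['y0']

        # per-dimension std of the normalized projections, to monitor collapse
        output, _ = y0
        output = torch.nn.functional.normalize(output.detach(), dim=1)
        output_std = torch.std(output, 0).mean()

        # exponential moving averages kept on the module
        w = 0.9
        self.avg_loss = w * self.avg_loss + (1 - w) * loss.item()
        self.avg_output_std = w * self.avg_output_std + (1 - w) * output_std.item()

        return step_output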