Lightning-AI / pytorch-lightning

Pretrain, finetune and deploy AI models on multiple GPUs, TPUs with zero code changes.
https://lightning.ai
Apache License 2.0

Using `accelerator=ddp` results in `RuntimeError: Expected to mark a variable ready only once. ` #8382

Closed nikvaessen closed 3 years ago

nikvaessen commented 3 years ago

🐛 Bug

A custom model that builds on top of the Wav2Vec2 model from the transformers library cannot be trained with `Trainer(accelerator="ddp")`. However, the same model class trains properly with a custom DDP training loop.
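
A minimal sketch of the failing configuration (`model` and `data` are instances of the classes defined in the full reproduction script below):

import pytorch_lightning as pl

# Two-GPU DDP training, configured as in the reproduction script below.
trainer = pl.Trainer(max_steps=1000, gpus=2, accelerator="ddp")
trainer.fit(model, datamodule=data)  # raises the RuntimeError shown below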

Using the PyTorch Lightning trainer results in the following traceback:

Traceback (most recent call last):
  File "/home/nik/workspace/phd/repos/wav2vec_speaker_identification/playground/wav2vec2_ddp_training_pl.py", line 253, in <module>
    main_with_pl()
  File "/home/nik/workspace/phd/repos/wav2vec_speaker_identification/playground/wav2vec2_ddp_training_pl.py", line 178, in main_with_pl
    trainer.fit(model, datamodule=data)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 458, in fit
    self._run(model)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 756, in _run
    self.dispatch()
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 797, in dispatch
    self.accelerator.start_training(self)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 96, in start_training
    self.training_type_plugin.start_training(trainer)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 144, in start_training
    self._results = trainer.run_stage()
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 807, in run_stage
    return self.run_train()
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 869, in run_train
    self.train_loop.run_training_epoch()
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 490, in run_training_epoch
    batch_output = self.run_training_batch(batch, batch_idx, dataloader_idx)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 731, in run_training_batch
    self.optimizer_step(optimizer, opt_idx, batch_idx, train_step_and_backward_closure)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 424, in optimizer_step
    model_ref.optimizer_step(
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1403, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 214, in step
    self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 134, in __optimizer_step
    trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 329, in optimizer_step
    self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 336, in run_optimizer_step
    self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 193, in optimizer_step
    optimizer.step(closure=lambda_closure, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/optim/optimizer.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/autograd/grad_mode.py", line 28, in decorate_context
    return func(*args, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/optim/adam.py", line 66, in step
    loss = closure()
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 725, in train_step_and_backward_closure
    result = self.training_step_and_backward(
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 827, in training_step_and_backward
    self.backward(result, optimizer, opt_idx)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/trainer/training_loop.py", line 864, in backward
    result.closure_loss = self.trainer.accelerator.backward(
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 308, in backward
    output = self.precision_plugin.backward(
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 79, in backward
    model.backward(closure_loss, optimizer, opt_idx)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1275, in backward
    loss.backward(*args, **kwargs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/autograd/function.py", line 87, in apply
    return self._forward_cls.backward(self, *args)  # type: ignore[attr-defined]
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/utils/checkpoint.py", line 138, in backward
    torch.autograd.backward(outputs_with_grad, args_with_grad)
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/torch/autograd/__init__.py", line 147, in backward
    Variable._execution_engine.run_backward(
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
Parameter at index 209 with name module.backbone.encoder.layers.11.final_layer_norm.weight has been marked as ready twice. This means that multiple autograd engine  hooks have fired for this particular parameter during this iteration.
Exception ignored in: <function tqdm.__del__ at 0x7f6d283533a0>
Traceback (most recent call last):
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/tqdm/std.py", line 1152, in __del__
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/tqdm/std.py", line 1306, in close
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/tqdm/std.py", line 1499, in display
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/tqdm/std.py", line 1155, in __str__
  File "/home/nik/.cache/pypoetry/virtualenvs/wav2vec-speaker-identification-tOHn1t-u-py3.8/lib/python3.8/site-packages/tqdm/std.py", line 1457, in format_dict
TypeError: cannot unpack non-iterable NoneType object

Process finished with exit code 1
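
The error message itself suggests `_set_static_graph()` as a workaround. A hedged sketch of what that would look like on a plain DDP wrapper, noting that `_set_static_graph` is a private PyTorch API and it is untested whether it resolves this particular case:

from torch.nn.parallel import DistributedDataParallel as DDP

ddp_model = DDP(model, device_ids=[rank])
# Private API named in the error message: declares that the autograd graph
# does not change across iterations, which lets DDP tolerate parameters that
# participate in reentrant (checkpointed) backward passes.
ddp_model._set_static_graph()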

To Reproduce

You can run the following script (with pytorch_lightning, torch and transformers as dependencies).

Setting `USE_PYTORCH_LIGHTNING = True` reproduces the error pasted above, while `USE_PYTORCH_LIGHTNING = False` trains properly without any errors.

################################################################################
# set training settings used in script

NUM_STEPS = 1000
BATCH_SIZE = 2
NUM_GPUS = 2
USE_PYTORCH_LIGHTNING = True

################################################################################
# imports

import os

from typing import Union, List, Dict, Optional

import pytorch_lightning as pl

import torch as t
import torch.nn.functional as F
import torch.random
import torch.distributed as dist
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader

from transformers.models.wav2vec2 import Wav2Vec2Model

################################################################################
# model doing binary classification

class BinaryClassificationSequenceModel(pl.LightningModule):
    def __init__(self):
        super().__init__()

        self.backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
        self.prediction_head = t.nn.Linear(in_features=768, out_features=2)

    def forward(self, input_sequence: t.Tensor):
        wav2vec2_tokens = self.backbone(input_sequence).last_hidden_state
        pooled_token = t.mean(wav2vec2_tokens, dim=1)
        class_score = self.prediction_head(pooled_token)

        return class_score

    def training_step(self, batch, batch_idx):
        input_sequence = batch["sequence"]
        input_labels = batch["label"]

        prediction = self(input_sequence)
        loss = F.cross_entropy(prediction, input_labels)

        return loss

    def validation_step(self, batch, batch_idx):
        input_sequence = batch["sequence"]
        input_labels = batch["label"]

        prediction = self(input_sequence)
        loss = F.cross_entropy(prediction, input_labels)

        return loss

    def test_step(self, batch, batch_idx):
        input_sequence = batch["sequence"]
        input_labels = batch["label"]

        prediction = self(input_sequence)
        loss = F.cross_entropy(prediction, input_labels)

        return loss

    def configure_optimizers(self):
        return t.optim.Adam(self.parameters(), 3e-3)

################################################################################
# model creating sequence data with 2 different distributions

class TwoDistDataset(t.utils.data.Dataset):
    def __init__(
        self,
        num_samples: int = 500,
        sequence_length: int = 48_000,
        mean_c1=-1,
        mean_c2=1,
        seed=123,
    ):
        self.num_samples_per_dist = num_samples // 2
        self.num_samples = self.num_samples_per_dist * 2
        self.sequence_length = sequence_length
        self.mean_c1 = mean_c1
        self.mean_c2 = mean_c2
        self.seed = seed

        data, labels = self._generate_data()

        self.data = data
        self.labels = labels

    def __getitem__(self, index):
        return {
            "sequence": t.clone(self.data[index]),
            "label": t.clone(self.labels[index]),
        }

    def __len__(self):
        return self.num_samples

    def _generate_data(self):
        # fixed seed so every rank generates identical data
        torch.random.manual_seed(self.seed)

        c1 = t.rand((self.num_samples_per_dist, self.sequence_length)) + self.mean_c1
        c2 = t.rand((self.num_samples_per_dist, self.sequence_length)) + self.mean_c2

        data = t.cat([c1, c2])
        labels = t.cat(
            [
                t.zeros((self.num_samples_per_dist,)),
                t.ones((self.num_samples_per_dist,)),
            ]
        )

        return data, labels.to(t.long)

class TwoDistSequenceDataModule(pl.LightningDataModule):
    def __init__(self, batch_size):
        super().__init__()
        self.batch_size = batch_size

        # initialized in setup()
        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None

    def setup(self, stage: Optional[str] = None) -> None:
        self.train_dataset = TwoDistDataset(seed=1)
        self.val_dataset = TwoDistDataset(seed=2)
        self.test_dataset = TwoDistDataset(seed=3)

    def train_dataloader(
        self,
    ) -> Union[DataLoader, List[DataLoader], Dict[str, DataLoader]]:
        return DataLoader(self.train_dataset, self.batch_size, shuffle=True)

    def val_dataloader(self) -> Union[DataLoader, List[DataLoader]]:
        return DataLoader(self.val_dataset, self.batch_size, shuffle=False)

    def test_dataloader(self) -> Union[DataLoader, List[DataLoader]]:
        return DataLoader(self.test_dataset, self.batch_size, shuffle=False)

################################################################################
# training with pytorch-lightning trainer

def main_with_pl():
    # create model
    model = BinaryClassificationSequenceModel()

    # create data
    data = TwoDistSequenceDataModule(batch_size=BATCH_SIZE)

    # create trainer
    trainer = pl.Trainer(
        max_steps=NUM_STEPS,
        num_sanity_val_steps=0,
        gpus=NUM_GPUS,
        accelerator="ddp",
    )

    # fit model on data
    trainer.fit(model, datamodule=data)

################################################################################
# training with custom DDP logic

def setup(rank, world_size):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "12355"

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

def train_loop_ddp(rank, world_size):
    # set up ddp
    print(f"Running basic DDP example on {rank=} with {world_size=}.")
    setup(rank, world_size)

    # create model
    model = BinaryClassificationSequenceModel()

    # create data
    data = TwoDistSequenceDataModule(batch_size=BATCH_SIZE)
    data.prepare_data()
    data.setup()

    # move model and data to GPU
    model = model.to(rank)

    # set up DDP model
    ddp_model = DDP(model, device_ids=[rank], find_unused_parameters=True)
    optimizer = model.configure_optimizers()

    # training loop
    step = 0

    while step < NUM_STEPS:
        for batch in data.train_dataloader():
            step += 1

            if step > NUM_STEPS:  # train exactly NUM_STEPS batches
                break

            batch = {k: v.to(rank) for k, v in batch.items()}

            optimizer.zero_grad()

            output = ddp_model(batch["sequence"])
            loss = F.cross_entropy(output, batch["label"])

            loss.backward()
            optimizer.step()

            if rank == 0:
                print(f"{step=:>4d}\tloss={loss.cpu().detach().numpy()}")

    cleanup()

def main_custom_ddp():
    mp.spawn(train_loop_ddp, args=(NUM_GPUS,), nprocs=NUM_GPUS, join=True)

################################################################################
# execute correct main function

if __name__ == "__main__":
    os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
    os.environ["PL_TORCH_DISTRIBUTED_BACKEND"] = "gloo"

    if USE_PYTORCH_LIGHTNING:
        main_with_pl()
    else:
        main_custom_ddp()

Expected behavior

Using the pytorch-lightning `Trainer` class should not result in a `RuntimeError`.

Environment

* CUDA:
        - GPU:
                - NVIDIA GeForce RTX 2080 Ti
                - NVIDIA GeForce RTX 2080 Ti
        - available:         True
        - version:           10.2
* Packages:
        - numpy:             1.21.0
        - pyTorch_debug:     False
        - pyTorch_version:   1.9.0+cu102
        - pytorch-lightning: 1.3.8
        - tqdm:              4.61.2
* System:
        - OS:                Linux
        - architecture:
                - 64bit
                - ELF
        - processor:         x86_64
        - python:            3.8.7
        - version:           #68~20.04.1-Ubuntu SMP Wed Jun 30 10:32:39 UTC 2021


nikvaessen commented 3 years ago

For anyone who ends up here after googling, I managed to solve it by reading the discussion here: https://discuss.huggingface.co/t/wav2vec-fine-tuning-with-multigpu/4894/17

Basically, the huggingface model uses gradient_checkpointing, and this needs to be disabled in order to use DDP. In the example code above this can be done with:

self.backbone.config.gradient_checkpointing = False

I'm not sure why my custom DDP worked but I might have made a mistake...
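
For completeness, a sketch of applying this fix in the model's `__init__` from the script above. The `gradient_checkpointing_disable()` method is the newer transformers API (roughly v4.11+, an assumption worth double-checking); on the version used in this issue the config flag is the way to go:

class BinaryClassificationSequenceModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.backbone = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
        self.prediction_head = t.nn.Linear(in_features=768, out_features=2)

        # Disable gradient checkpointing so the same parameters are not reused
        # across multiple reentrant backward passes, which DDP does not support.
        if hasattr(self.backbone, "gradient_checkpointing_disable"):
            self.backbone.gradient_checkpointing_disable()
        else:
            self.backbone.config.gradient_checkpointing = False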