RuntimeError with Mixed CUDA Devices for Multi-GPU Training

Tested versions

pyannote-audio 3.1.1
cuda version 11.8
torch 2.2.1+cu118

System information

SUSE Linux 15 SP3 - pyannote.audio-3.1.1 - ALCF Polaris

Issue description

I tried to train a VAD model with 4 GPUs on a single node. The error message is:

Traceback (most recent call last):
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/pipelines/vad.py", line 157, in <module>
    model = train_model(args, protocol)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/pipelines/vad.py", line 118, in train_model
    trainer.fit(model)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 544, in fit
    call._call_and_handle_interrupt(
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 43, in _call_and_handle_interrupt
    return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/strategies/launchers/subprocess_script.py", line 105, in launch
    return function(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 580, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 949, in _run
    call._call_setup_hook(self) # allow user to set up LightningModule in accelerator environment
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 94, in _call_setup_hook
    _call_lightning_module_hook(trainer, "setup", stage=fn)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pytorch_lightning/trainer/call.py", line 157, in _call_lightning_module_hook
    output = fn(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pyannote/audio/core/model.py", line 264, in setup
    _ = self.example_output
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/functools.py", line 967, in __get__
    val = self.func(instance)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pyannote/audio/core/model.py", line 195, in example_output
    example_output = self(example_input_array)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pyannote/audio/models/segmentation/PyanNet.py", line 172, in forward
    outputs = self.sincnet(waveforms)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/pyannote/audio/models/blocks/sincnet.py", line 81, in forward
    outputs = self.wav_norm1d(waveforms)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1511, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1520, in _call_impl
    return forward_call(*args, **kwargs)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/instancenorm.py", line 87, in forward
    return self._apply_instance_norm(input)
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/modules/instancenorm.py", line 36, in _apply_instance_norm
    return F.instance_norm(
  File "/lus/grand/projects/BPC/ra/shiyanglai/conv_rec_framework/polaris/build/pyannote2/lib/python3.8/site-packages/torch/nn/functional.py", line 2526, in instance_norm
    return torch.instance_norm(
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:3 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA__cudnn_batch_norm)
[rank: 1] Child process with PID 35924 terminated with code 1. Forcefully terminating all other processes to avoid zombies 🧟
/var/spool/pbs/mom_priv/jobs/1788633.polaris-pbs-01.hsn.cm.polaris.alcf.anl.gov.SC: line 37: 35559 Killed python vad.py

It seems to me like the data was not loaded to GPU devices. Below is a simplified version of my training script:

def train_model(args, protocol):
    """Train a PyanNet voice-activity-detection model and return it.

    Builds a ``VoiceActivityDetection`` task on ``protocol``, wraps it in a
    ``PyanNet`` model configured from ``args``, and fits it with a
    ``pl.Trainer`` using the ``deepspeed_stage_2`` strategy on automatically
    selected devices.

    Parameters
    ----------
    args : mapping
        Configuration values. Keys read here: ``'gpus'``, ``'stride'``,
        ``'lstm_layer'``, ``'bidirectional'``, ``'hidden_size'``,
        ``'dropout'``, ``'linear_layer'``, ``'time'`` and ``'corpus_name'``.
    protocol
        Data protocol handed unchanged to ``VoiceActivityDetection``.

    Returns
    -------
    The ``PyanNet`` instance after ``trainer.fit`` has returned.

    Notes
    -----
    ``max_epoch`` and ``OUTPUT_DIR`` are not defined in this function; they
    are looked up in the enclosing (module) scope.
    """
    # NOTE(review): num_workers is set to the GPU count; presumably it is the
    # number of data-loading workers -- confirm this is the intended value.
    vad = VoiceActivityDetection(
        protocol,
        duration=2.,
        batch_size=256,
        num_workers=args['gpus'],
    )

    model = PyanNet(
        task=vad,
        sincnet={
            'stride': args['stride'],
        },
        lstm={
            'num_layers': args['lstm_layer'],
            'bidirectional': args['bidirectional'],
            'hidden_size': args['hidden_size'],
            'dropout': args['dropout'],
        },
        linear={
            'num_layers': args['linear_layer'],
        },
    )

    trainer = pl.Trainer(
        strategy='deepspeed_stage_2',
        max_time=args['time'],
        max_epochs=max_epoch,
        default_root_dir=OUTPUT_DIR+args['corpus_name']+'/',
        devices="auto",
        accelerator="auto",
        use_distributed_sampler=True,
        enable_progress_bar=True,
    )

    print('start training...')
    # Do NOT call model.to(torch.device("cuda")) here. With several GPUs every
    # rank process would move the parameters to an index-less CUDA device
    # before the Trainer has assigned that rank its own GPU, which matches the
    # reported "found at least two devices, cuda:3 and cuda:0" failure raised
    # from the model's setup hook. The Trainer places the model on the correct
    # device for each rank as part of fit().
    trainer.fit(model)
    print("trained successfully!")
    return model

if __name__ == "__main__":
    # Step 1: load the module configuration.
    args = read_config(CONFIGURATION)

    # Step 2: check the device allocation (any return value is discarded).
    check_devices(args)

    # Step 3: load the data protocol for the configured corpus / sub-corpus.
    protocol = load_data(
        args['corpus_name'],
        args['sub_corpus'],
    )

    # Step 4: train the model (checkpoints are handled during training).
    model = train_model(args, protocol)

    # Step 5: tune the pipeline built around the trained model.
    pipeline = tune_pipeline(model, protocol)

    # Step 6: evaluate the tuned pipeline with the detection-error-rate metric.
    test_performance(
        pipeline,
        protocol,
        calculate_detection_error_rate,
        args['job_id'],
        args['corpus_name'],
    )

And here is my bash script:

#!/bin/bash

#PBS -A ABC
#PBS -N pyannote
#PBS -l filesystems=home
#PBS -l walltime=00:30:00
#PBS -l select=1:system=polaris
#PBS -l place=scatter
#PBS -q debug
#PBS -k doe

# PBS job script: sets up the toolchain and conda environment, then launches
# vad.py. bash is required: the here-string (<<<) and $'\n' quoting used
# below are not POSIX sh features.

module load PrgEnv-gnu/8.3.3
module load llvm/release-15.0.0
module load conda/2023-10-04

# Drop every LD_LIBRARY_PATH entry that mentions cudnn: split on ':', filter,
# re-join. Presumably this keeps a system cuDNN from shadowing the one the
# conda environment provides -- confirm.
LD_LIBRARY_PATH=$( tr : $'\n' <<<"$LD_LIBRARY_PATH" | grep -v cudnn | paste -s -d : )

conda activate pyannote2

# Enable GPU-MPI
export MPICH_GPU_SUPPORT_ENABLED=1
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_DEVICE_MAX_CONNECTIONS=1

# Abort instead of launching vad.py from the wrong directory if cd fails.
cd ../conv_rec_framework/pipelines/ || exit 1

python vad.py

Minimal reproduction example (MRE)

None

pyannote / pyannote-audio