coqui-ai / TTS

🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production
http://coqui.ai
Mozilla Public License 2.0

[Bug] Possible Trainer bug ( RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! ) #1979

Closed jreus closed 2 years ago

jreus commented 2 years ago

Describe the bug

Hey all. I'm trying to train a Capacitron model at the moment and keep running into a device error: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (stack trace below). I'm basically following the structure of the training recipe verbatim, using a custom dataset formatter function.

I'm posting the issue here because I was under the impression that the Trainer is responsible for making sure all tensors end up on the GPU. Does this issue look familiar to anyone?
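
For what it's worth, here is a minimal standalone sketch of how I imagine this kind of mismatch can arise, e.g. if a pos_weight tensor stays on the CPU while the model outputs are on the GPU. This is only my guess based on the stack trace below, not the actual TTS code, and the tensor names are made up:

```python
import torch
import torch.nn.functional as F

# Model outputs and targets end up on the GPU via the Trainer / data loader.
device = "cuda" if torch.cuda.is_available() else "cpu"
logits = torch.randn(4, 10, device=device)
targets = torch.randint(0, 2, (4, 10), device=device).float()

# Hypothetical positive-class weight left on the CPU (e.g. built straight from a config value).
pos_weight = torch.tensor([15.0])  # never moved to the GPU

try:
    # On a CUDA machine this raises:
    # RuntimeError: Expected all tensors to be on the same device,
    # but found at least two devices, cuda:0 and cpu!
    F.binary_cross_entropy_with_logits(logits, targets, pos_weight=pos_weight)
except RuntimeError as err:
    print(err)

# Moving the weight onto the logits' device avoids the error.
loss = F.binary_cross_entropy_with_logits(
    logits, targets, pos_weight=pos_weight.to(logits.device)
)
print(loss.item())
```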

To Reproduce

TrainCapacitron.py

import argparse
from argparse import Namespace
import os
from pathlib import Path
from math import floor

import torch

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.configs.tacotron2_config import Tacotron2Config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.tacotron2 import Tacotron2
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# custom formatter
def dataset_formatter(root_path: str, manifest_file: str, **kwargs) -> list:  # pylint: disable=unused-argument
    """
    Assumes an LJSpeech-style metadata CSV file (with an added speaker name column)
    where each row is `<filename>|<raw transcription>|<clean transcription>|<speaker_name>`
    """
    txt_file = os.path.join(root_path, manifest_file)
    items = []
    audio_path = "wavs22050"
    with open(txt_file, "r", encoding="utf-8") as ttf:
        for line in ttf:
            line = line.strip()  # remove trailing newline
            cols = line.split("|")
            speaker_name = cols[3]
            wav_file = os.path.join(root_path, audio_path, speaker_name, cols[0])
            text = cols[1]
            items.append({"text": text, "audio_file": wav_file, "speaker_name": speaker_name})
    return items

def train_capacitron(args: Namespace) -> None:
    # Use LJSpeech-style dataset processing for the Blizzard dataset (but with our own formatter)
    dataset_config = BaseDatasetConfig(
        name="ljspeech",
        meta_file_train="ljmetadata_combined.csv",
        path=args.dataset_path,
    )

    audio_config = BaseAudioConfig(
        sample_rate=args.sample_rate,  # we resampled Blizzard2013 to 22050
        do_trim_silence=True,
        trim_db=60.0,
        signal_norm=True,
        mel_fmin=args.mel_fmin,
        mel_fmax=args.mel_fmax,    # nyquist of sample_rate
        spec_gain=25.0,
        log_func="np.log10",
        ref_level_db=20,
        preemphasis=0.0,
        min_level_db=-100,
    )

    # Using the standard Capacitron config
    capacitron_config = CapacitronVAEConfig(capacitron_VAE_loss_alpha=1.0)

    config = Tacotron2Config(
        run_name="CAPACITRON-T2-BLIZZARD",
        audio=audio_config,
        capacitron_vae=capacitron_config,
        use_capacitron_vae=True,
        batch_size=args.batch_size,  # 246 in the original recipe; tune this to your GPU
        max_audio_len=floor(args.max_audio_len * 22050),  # 6 * 22050 in the original recipe; tune this to your GPU
        min_audio_len=1 * 22050,
        eval_batch_size=16,
        num_loader_workers=args.num_loader_workers,
        num_eval_loader_workers=8,
        precompute_num_workers=args.num_precompute_workers,
        run_eval=True,
        test_delay_epochs=5,

        r=2,
        optimizer="CapacitronOptimizer",
        optimizer_params={"RAdam": {"betas": [0.9, 0.998], "weight_decay": 1e-6}, "SGD": {"lr": 1e-5, "momentum": 0.9}},
        attention_type="dynamic_convolution",
        grad_clip=0.0,  # Important! We overwrite the standard grad_clip with capacitron_grad_clip
        double_decoder_consistency=False,

        epochs=10, # Only 10 epochs for testing purposes
        #epochs=1000,

        text_cleaner="phoneme_cleaners",
        use_phonemes=True,
        phoneme_language="en-us",
        phonemizer="espeak",
        phoneme_cache_path=os.path.join(args.output_path, "Capacitron/Blizzard", "phoneme_cache"),
        stopnet_pos_weight=15,
        print_step=25,
        print_eval=True,
        mixed_precision=False,
        output_path=os.path.join(args.output_path, "Capacitron"),
        datasets=[dataset_config],

        lr=1e-3,
        lr_scheduler="StepwiseGradualLR",
        lr_scheduler_params={
            "gradual_learning_rates": [
                [0, 1e-3],
                [2e4, 5e-4],
                [4e4, 3e-4],
                [6e4, 1e-4],
                [8e4, 5e-5],
            ]
        },
        scheduler_after_epoch=False,  # scheduler doesn't work without this flag
        seq_len_norm=True,
        loss_masking=False,

        decoder_loss_alpha=1.0,
        postnet_loss_alpha=1.0,
        postnet_diff_spec_alpha=1.0,
        decoder_diff_spec_alpha=1.0,
        decoder_ssim_alpha=1.0,
        postnet_ssim_alpha=1.0,
    )

    ap = AudioProcessor(**config.audio.to_dict())

    tokenizer, config = TTSTokenizer.init_from_config(config)

    # parse dataset using our custom dataset formatter
    train_samples, eval_samples = load_tts_samples(dataset_config, eval_split=True, formatter=dataset_formatter)

    model = Tacotron2(config, ap, tokenizer, speaker_manager=None)

    # Train!
    trainer = Trainer(
        TrainerArgs(),
        config,
        os.path.join(args.output_path, "Capacitron"),
        model=model,
        train_samples=train_samples,
        eval_samples=eval_samples,
        training_assets={"audio_processor": ap},
    )

    trainer.fit()

if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="""Capacitron Model Trainer.\n\n"""
    )

    parser.add_argument(
        "--dataset_path","-i",
        type=Path,
        required=True,
        default=None,
        help="Path to training dataset.",
    )

    parser.add_argument(
        "--output_path","-o",
        type=Path,
        required=True,
        default=None,
        help="Path to write training output.",
    )

    parser.add_argument(
        "--sample_rate",
        type=int,
        default=22050,
        help="AudioProcessor sampling rate (default 22050)",
    )

    parser.add_argument(
        "--mel_fmin",
        type=float,
        default=80,
        help="Minimum Mel Frequency Bin, tune lower for male voices (default 80hz)",
    )

    parser.add_argument(
        "--mel_fmax",
        type=float,
        default=11025,
        help="Maximum Mel Frequency Bin (default 11025hz)",
    )

    parser.add_argument(
        "--num_loader_workers",
        type=int,
        default=12,
        help="Number of data loader threads (default 12)",
    )

    parser.add_argument(
        "--num_precompute_workers",
        type=int,
        default=24,
        help="Number of feature precomputing worker threads (default 24)",
    )

    parser.add_argument(
        "--batch_size",
        type=int,
        default=256,
        help="Tune this to your GPU's available memory (default 256)",
    )

    parser.add_argument(
        "--max_audio_len",
        type=float,
        default=6,
        help="In seconds (multiplied by sample rate) - tune this to your GPU's available memory (default 6s)",
    )

    args = parser.parse_args()

    args.dataset_path = args.dataset_path.resolve()
    args.output_path = args.output_path.resolve()

    train_capacitron(args)
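
For reference, I launch the script roughly like this (same paths as in the logs below):

    python TrainCapacitron.py -i /media/jon/Store/R2201_datasets/Blizzard2013_Segmented -o /media/jon/Store/R2201_outputs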

Expected behavior

All tensors are moved to the GPU and training runs without the device error.

Logs


> Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:80
 | > mel_fmax:11025
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:25.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60.0
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 | > Found 40565 files in /media/jon/Store/R2201_datasets/Blizzard2013_Segmented
 > Using CUDA: True
 > Number of GPUs: 1

 > Model has 30339130 parameters

 > EPOCH: 0/10
 --> /media/jon/Store/R2201_outputs/Capacitron/CAPACITRON-T2-BLIZZARD-September-12-2022_10+12PM-e3144f4

> DataLoader initialization
| > Tokenizer:
        | > add_blank: False
        | > use_eos_bos: False
        | > use_phonemes: True
        | > phonemizer:
                | > phoneme language: en-us
                | > phoneme backend: espeak
| > Number of instances : 40160
 | > Preprocessing samples
 | > Max text length: 139
 | > Min text length: 3
 | > Avg text length: 51.75073736195898
 | 
 | > Max audio length: 132120.0
 | > Min audio length: 22089.0
 | > Avg audio length: 65252.51515879004
 | > Num. instances discarded samples: 11002
 | > Batch group size: 0.

> TRAINING (2022-09-12 21:01:21) 
/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/TTS/tts/models/tacotron2.py:335: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
  alignment_lengths = mel_lengths // self.decoder.r
 ! Run is removed from /media/jon/Store/R2201_outputs/Capacitron/CAPACITRON-T2-BLIZZARD-September-12-2022_09+01PM-d8a0284
Traceback (most recent call last):
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/trainer/trainer.py", line 1533, in fit
    self._fit()
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/trainer/trainer.py", line 1517, in _fit
    self.train_epoch()
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/trainer/trainer.py", line 1282, in train_epoch
    _, _ = self.train_step(batch, batch_num_steps, cur_step, loader_start_time)
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/trainer/trainer.py", line 1114, in train_step
    outputs, loss_dict_new, step_time = self._optimize(
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/trainer/trainer.py", line 998, in _optimize
    outputs, loss_dict = self._model_train_step(batch, model, criterion)
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/trainer/trainer.py", line 954, in _model_train_step
    return model.train_step(*input_args)
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/TTS/tts/models/tacotron2.py", line 339, in train_step
    loss_dict = criterion(
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/TTS/tts/layers/losses.py", line 440, in forward
    self.criterion_st(stopnet_output, stopnet_target, stop_target_length)
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/TTS/tts/layers/losses.py", line 193, in forward
    loss = functional.binary_cross_entropy_with_logits(
  File "/home/jon/miniconda3/envs/coqui/lib/python3.9/site-packages/torch/nn/functional.py", line 3150, in binary_cross_entropy_with_logits
    return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

### Environment

```shell
{
    "CUDA": {
        "GPU": [
            "NVIDIA GeForce RTX 3070 Laptop GPU"
        ],
        "available": true,
        "version": "11.6"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "1.12.1",
        "Trainer": "v0.0.13",
        "numpy": "1.21.6"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            "ELF"
        ],
        "processor": "x86_64",
        "python": "3.9.12",
        "version": "#51~20.04.1-Ubuntu SMP Fri Jan 7 06:51:40 UTC 2022"
    }
}
```

Additional context

No response

erogol commented 2 years ago

It is not about the trainer. It is in 🐸TTS. Can you try the latest TTS version?

erogol commented 2 years ago

@WeberJulian can you take a look at this?

WeberJulian commented 2 years ago

Using your recipe, I couldn't reproduce the error you show here; training runs fine. I'm using TTS v0.7.1, by the way; v0.8.0 seems to break something else, which I'm investigating. If checking out v0.7.1 doesn't solve the issue for you, try installing TTS from scratch in a new env.

  --> STEP: 0/18 -- GLOBAL_STEP: 0
     | > decoder_loss: 5.80364  (5.80364)
     | > postnet_loss: 7.56972  (7.56972)
     | > capaciton_reconstruction_loss: 26195.39648  (26195.39648)
     | > capacitron_vae_loss: -0.01047  (-0.01047)
     | > capacitron_vae_beta_loss: 145.79236  (145.79236)
     | > capacitron_vae_kl_term: 4.20764  (4.20764)
     | > capacitron_beta: 1.00000  (1.00000)
     | > stopnet_loss: 1.00547  (1.00547)
     | > ga_loss: 0.00614  (0.00614)
     | > decoder_diff_spec_loss: 0.44172  (0.44172)
     | > postnet_diff_spec_loss: 3.98933  (3.98933)
     | > decoder_ssim_loss: 0.83536  (0.83536)
     | > postnet_ssim_loss: 0.83291  (0.83291)
     | > loss: 20.49835  (20.49835)
     | > align_error: 0.93405  (0.93405)
     | > grad_norm: 0.00000  (0.00000)
     | > current_lr: 0.00100 
     | > step_time: 1.20270  (1.20274)
     | > loader_time: 0.92890  (0.92894)
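
In case it helps, by a fresh env I mean something along these lines (versions here are just an example; adjust to your setup):

    conda create -n coqui-fresh python=3.9
    conda activate coqui-fresh
    pip install TTS==0.7.1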