coqui-ai / TTS

🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production

[Bug] How to train speedyspeech correctly on ljspeech dataset #3400

Closed: Henryplay closed this issue 7 months ago

Henryplay commented 9 months ago

### Describe the bug

I've checked the past issues about SpeedySpeech and could not find the same question. Over the past few days I trained the SpeedySpeech model on LJSpeech for 1000 epochs, using the pretrained HiFi-GAN model as the vocoder. The synthesized wav does not sound good. When I then swapped my acoustic SpeedySpeech model for the pretrained one, the wav sounds good. The synthesis command is:

```shell
tts \
    --text "We follow the discriminator architecture of the multi-period
discriminator proposed in HiFi-GAN" \
    --model_path ./recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278/best_model_40373.pth \
    --config_path ./recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278/config.json \
    --out_path train_speedyspeech_hifigan.wav \
    --vocoder_path pretrained_model/vocoder_models--en--ljspeech--hifigan_v2/model_file.pth \
    --vocoder_config_path pretrained_model/vocoder_models--en--ljspeech--hifigan_v2/config.json
```
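
The swap to the pretrained acoustic model mentioned above can be reproduced through the same CLI. A minimal sketch, assuming the model zoo names `tts_models/en/ljspeech/speedy_speech` and `vocoder_models/en/ljspeech/hifigan_v2` (both should be listed by `tts --list_models`):

```shell
tts \
    --text "We follow the discriminator architecture of the multi-period discriminator proposed in HiFi-GAN" \
    --model_name tts_models/en/ljspeech/speedy_speech \
    --vocoder_name vocoder_models/en/ljspeech/hifigan_v2 \
    --out_path pretrained_speedyspeech_hifigan.wav
```

If this output sounds good while the locally trained checkpoint does not, the problem is isolated to the acoustic model rather than the vocoder pairing.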

### To Reproduce

1. I used the `recipes/ljspeech/speedy_speech/train_speedy_speech.py` script with the command `CUDA_VISIBLE_DEVICES=0,1,2,3 python -m trainer.distribute --script train_speedy_speech.py`. Here is the code:

```python
import os

from trainer import Trainer, TrainerArgs

from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata_train.csv",
    meta_file_val="metadata_val.csv",
    # NOTE: os.path.join discards output_path when the second argument is
    # absolute, so this resolves to "/data2/DATASET/LJSpeech-1.1" directly.
    path=os.path.join(output_path, "/data2/DATASET/LJSpeech-1.1"),
)

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = SpeedySpeechConfig(
    run_name="speedy_speech_ljspeech",
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    precompute_num_workers=4,
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# The audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to `load_tts_samples`
# (see the sketch after this script).
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init model
model = ForwardTTS(config, ap, tokenizer)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()
```
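
As the comments in the script note, `load_tts_samples` also accepts a custom formatter instead of a formatter name. A minimal sketch, modeled on the built-in `ljspeech` formatter (the `pipe_formatter` name is hypothetical, and the sample dict keys are assumptions based on this TTS version's `TTS.tts.datasets` API):

```python
import os

def pipe_formatter(root_path, meta_file, **kwargs):
    """Hypothetical formatter: parse a pipe-separated metadata file
    (file_id|raw_text|normalized_text) into the sample dicts that
    load_tts_samples expects."""
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split("|")
            items.append(
                {
                    "text": cols[2],
                    "audio_file": os.path.join(root_path, "wavs", cols[0] + ".wav"),
                    "speaker_name": "ljspeech",
                    "root_path": root_path,
                }
            )
    return items

# Usage (sketch): train_samples, eval_samples = load_tts_samples(
#     dataset_config, eval_split=True, formatter=pipe_formatter)
```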

2. Here is the generated config file:

```json
{
    "output_path": "/data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech",
    "logger_uri": null,
    "run_name": "speedy_speech_ljspeech",
    "project_name": null,
    "run_description": "\ud83d\udc38Coqui trainer run.",
    "print_step": 50,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "tensorboard",
    "log_model_step": null,
    "save_step": 10000,
    "save_n_checkpoints": 5,
    "save_checkpoints": true,
    "save_all_best": false,
    "save_best_after": 10000,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": -1,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": false,
    "epochs": 1000,
    "batch_size": 64,
    "eval_batch_size": 16,
    "grad_clip": 5.0,
    "scheduler_after_epoch": true,
    "lr": 0.0001,
    "optimizer": "Adam",
    "optimizer_params": {
        "betas": [
            0.9,
            0.998
        ],
        "weight_decay": 1e-06
    },
    "lr_scheduler": "NoamLR",
    "lr_scheduler_params": {
        "warmup_steps": 4000
    },
    "use_grad_scaler": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 54321,
    "model": "speedy_speech",
    "num_loader_workers": 4,
    "num_eval_loader_workers": 4,
    "use_noise_augment": false,
    "audio": {
        "fft_size": 1024,
        "win_length": 1024,
        "hop_length": 256,
        "frame_shift_ms": null,
        "frame_length_ms": null,
        "stft_pad_mode": "reflect",
        "sample_rate": 22050,
        "resample": false,
        "preemphasis": 0.0,
        "ref_level_db": 20,
        "do_sound_norm": false,
        "log_func": "np.log",
        "do_trim_silence": true,
        "trim_db": 60.0,
        "do_rms_norm": false,
        "db_level": null,
        "power": 1.5,
        "griffin_lim_iters": 60,
        "num_mels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000,
        "spec_gain": 1.0,
        "do_amp_to_db_linear": true,
        "do_amp_to_db_mel": true,
        "pitch_fmax": 640.0,
        "pitch_fmin": 1.0,
        "signal_norm": false,
        "min_level_db": -100,
        "symmetric_norm": true,
        "max_norm": 4.0,
        "clip_norm": true,
        "stats_path": null
    },
    "use_phonemes": true,
    "phonemizer": "espeak",
    "phoneme_language": "en-us",
    "compute_input_seq_cache": true,
    "text_cleaner": "english_cleaners",
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": "/data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech/phoneme_cache",
    "characters": {
        "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
        "vocab_dict": null,
        "pad": "<PAD>",
        "eos": "<EOS>",
        "bos": "<BOS>",
        "blank": "<BLNK>",
        "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
        "punctuations": "!'(),-.:;? ",
        "phonemes": null,
        "is_unique": false,
        "is_sorted": true
    },
    "add_blank": false,
    "batch_group_size": 0,
    "loss_masking": null,
    "min_audio_len": 1,
    "max_audio_len": Infinity,
    "min_text_len": 1,
    "max_text_len": Infinity,
    "compute_f0": false,
    "compute_energy": false,
    "compute_linear_spec": false,
    "precompute_num_workers": 4,
    "start_by_longest": false,
    "shuffle": false,
    "drop_last": false,
    "datasets": [
        {
            "formatter": "ljspeech",
            "dataset_name": "",
            "path": "/data2/DATASET/LJSpeech-1.1",
            "meta_file_train": "metadata_train.csv",
            "ignored_speakers": null,
            "language": "",
            "phonemizer": "",
            "meta_file_val": "metadata_val.csv",
            "meta_file_attn_mask": ""
        }
    ],
    "test_sentences": [
        "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        "Be a voice, not an echo.",
        "I'm sorry Dave. I'm afraid I can't do that.",
        "This cake is great. It's so delicious and moist.",
        "Prior to November 22, 1963."
    ],
    "eval_split_max_size": null,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "base_model": "forward_tts",
    "model_args": {
        "num_chars": 131,
        "out_channels": 80,
        "hidden_channels": 128,
        "use_aligner": true,
        "use_pitch": false,
        "pitch_predictor_hidden_channels": 256,
        "pitch_predictor_kernel_size": 3,
        "pitch_predictor_dropout_p": 0.1,
        "pitch_embedding_kernel_size": 3,
        "use_energy": false,
        "energy_predictor_hidden_channels": 256,
        "energy_predictor_kernel_size": 3,
        "energy_predictor_dropout_p": 0.1,
        "energy_embedding_kernel_size": 3,
        "duration_predictor_hidden_channels": 256,
        "duration_predictor_kernel_size": 3,
        "duration_predictor_dropout_p": 0.1,
        "positional_encoding": true,
        "poisitonal_encoding_use_scale": true,
        "length_scale": 1,
        "encoder_type": "residual_conv_bn",
        "encoder_params": {
            "kernel_size": 4,
            "dilations": [
                1,
                2,
                4,
                1,
                2,
                4,
                1,
                2,
                4,
                1,
                2,
                4,
                1
            ],
            "num_conv_blocks": 2,
            "num_res_blocks": 13
        },
        "decoder_type": "residual_conv_bn",
        "decoder_params": {
            "kernel_size": 4,
            "dilations": [
                1,
                2,
                4,
                8,
                1,
                2,
                4,
                8,
                1,
                2,
                4,
                8,
                1,
                2,
                4,
                8,
                1
            ],
            "num_conv_blocks": 2,
            "num_res_blocks": 17
        },
        "detach_duration_predictor": true,
        "max_duration": 75,
        "num_speakers": 1,
        "use_speaker_embedding": false,
        "speakers_file": null,
        "use_d_vector_file": false,
        "d_vector_dim": null,
        "d_vector_file": null
    },
    "num_speakers": 0,
    "speakers_file": null,
    "use_speaker_embedding": false,
    "use_d_vector_file": false,
    "d_vector_file": false,
    "d_vector_dim": 0,
    "spec_loss_type": "l1",
    "duration_loss_type": "huber",
    "use_ssim_loss": false,
    "ssim_loss_alpha": 1.0,
    "dur_loss_alpha": 1.0,
    "spec_loss_alpha": 1.0,
    "aligner_loss_alpha": 1.0,
    "binary_align_loss_alpha": 0.3,
    "binary_loss_warmup_epochs": 150,
    "min_seq_len": 13,
    "max_seq_len": 500000,
    "r": 1,
    "f0_cache_path": null,
    "github_branch": "* dev"
}
```

### Expected behavior

In the SpeedySpeech paper, it is necessary to first train a teacher model to extract durations, and then use the extracted durations to train the SpeedySpeech student model. However, in this recipe there is no step that trains a teacher model, and I see `If the alignment network is used, the model learns the text-to-speech alignment from the data instead of using pre-computed durations.` in `TTS/tts/models/forward_tts.py`, line 177. I am a bit confused about this. Is this the reason why the model I trained doesn't perform well?
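
For reference, the two regimes differ in a single flag. A minimal sketch of flipping it, assuming the `model_args.use_aligner` field shown in the config dump above (the durations would then have to come from a separately trained teacher, e.g. attention masks extracted with `TTS/bin/compute_attention_masks.py` and referenced via `meta_file_attn_mask` in the dataset config):

```python
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig

# Recipe default: the internal alignment network learns durations
# directly from the data, so no teacher model is trained first.
config = SpeedySpeechConfig(run_name="speedy_speech_aligner")
print(config.model_args.use_aligner)  # True, as in the config dump above

# Paper-style alternative (sketch): disable the aligner and rely on
# pre-computed teacher durations instead.
config.model_args.use_aligner = False
```

In other words, with the recipe defaults the missing teacher step appears to be intentional in this implementation.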

### Logs

Here is the log output from the end of training:

```
 > EPOCH: 998/1000
 --> /data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278

 > TRAINING (2023-12-10 00:42:38) 

   --> STEP: 44/47 -- GLOBAL_STEP: 46950
     | > loss_spec: 0.95118  (0.95257)
     | > loss_dur: 0.00258  (0.00194)
     | > loss_aligner: 4.66062  (4.61889)
     | > loss_binary_alignment: 0.60001  (0.59970)
     | > loss: 6.21439  (6.17310)
     | > duration_error: 3.77120  (3.68030)
     | > grad_norm: 7.32043  (7.81596)
     | > current_lr: 0.00002 
     | > step_time: 0.91460  (1.79724)
     | > loader_time: 3.51170  (1.37181)

 > EVALUATION 

  --> EVAL PERFORMANCE
     | > avg_loader_time: 0.34830 (+0.08128)
     | > avg_loss_spec: 2.09409 (-0.85148)
     | > avg_loss_dur: 0.01837 (+0.00047)
     | > avg_loss_aligner: 4.61922 (-0.00192)
     | > avg_loss_binary_alignment: 0.61072 (+0.00081)
     | > avg_loss: 7.34240 (-0.85212)
     | > avg_duration_error: 3.90552 (+0.00616)

 > EPOCH: 999/1000
 --> /data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278

 > TRAINING (2023-12-10 00:45:53) 

 > EVALUATION 

  --> EVAL PERFORMANCE
     | > avg_loader_time: 0.28263 (-0.06568)
     | > avg_loss_spec: 2.02141 (-0.07268)
     | > avg_loss_dur: 0.01793 (-0.00045)
     | > avg_loss_aligner: 4.61829 (-0.00093)
     | > avg_loss_binary_alignment: 0.61084 (+0.00012)
     | > avg_loss: 7.26846 (-0.07394)
     | > avg_duration_error: 3.90462 (-0.00090)
```

### Environment

```shell
{
    "CUDA": {
        "GPU": [
            "NVIDIA GeForce RTX 3090",
            "NVIDIA GeForce RTX 3090",
            "NVIDIA GeForce RTX 3090",
            "NVIDIA GeForce RTX 3090"
        ],
        "available": true,
        "version": "11.7"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "2.0.0+cu117",
        "TTS": "0.13.2",
        "numpy": "1.21.6"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            "ELF"
        ],
        "processor": "x86_64",
        "python": "3.8.16",
        "version": "#224-Ubuntu SMP Mon Jun 19 13:30:12 UTC 2023"
    }
}
```

### Additional context

No response

stale[bot] commented 8 months ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look at our discussion channels.