coqui-ai / TTS

🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production

[Bug] How to train speedyspeech correctly on ljspeech dataset #3400

Closed: Henryplay closed this issue 7 months ago

Henryplay commented 9 months ago

### Describe the bug

I've checked the past issues about SpeedySpeech and could not find the same question. Over the past few days I trained the SpeedySpeech model on LJSpeech for 1000 epochs, using the pretrained HiFi-GAN model as the vocoder. The synthesized wav does not sound good. When I then swapped my acoustic SpeedySpeech model for the pretrained one, the wav sounds good. The synthesis command is:

```shell
tts \
    --text "We follow the discriminator architecture of the multi-period
discriminator proposed in HiFi-GAN" \
    --model_path ./recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278/best_model_40373.pth \
    --config_path ./recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278/config.json \
    --out_path train_speedyspeech_hifigan.wav \
    --vocoder_path pretrained_model/vocoder_models--en--ljspeech--hifigan_v2/model_file.pth \
    --vocoder_config_path pretrained_model/vocoder_models--en--ljspeech--hifigan_v2/config.json
```
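
The swap to the pretrained acoustic model mentioned above can be reproduced through the same CLI. A minimal sketch, assuming the model zoo names `tts_models/en/ljspeech/speedy_speech` and `vocoder_models/en/ljspeech/hifigan_v2` (both should be listed by `tts --list_models`):

```shell
tts \
    --text "We follow the discriminator architecture of the multi-period discriminator proposed in HiFi-GAN" \
    --model_name tts_models/en/ljspeech/speedy_speech \
    --vocoder_name vocoder_models/en/ljspeech/hifigan_v2 \
    --out_path pretrained_speedyspeech_hifigan.wav
```

If this output sounds good while the locally trained checkpoint does not, the problem is isolated to the acoustic model rather than the vocoder pairing.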

### To Reproduce

1. I used the `recipes/ljspeech/speedy_speech/train_speedy_speech.py` script with the command `CUDA_VISIBLE_DEVICES=0,1,2,3 python -m trainer.distribute --script train_speedy_speech.py`. Here is the code:

```python
import os

from trainer import Trainer, TrainerArgs

from TTS.config import BaseAudioConfig, BaseDatasetConfig
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.forward_tts import ForwardTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

output_path = os.path.dirname(os.path.abspath(__file__))
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="metadata_train.csv",
    meta_file_val="metadata_val.csv",
    # NOTE: os.path.join discards output_path when the second argument is
    # absolute, so this resolves to "/data2/DATASET/LJSpeech-1.1" directly.
    path=os.path.join(output_path, "/data2/DATASET/LJSpeech-1.1"),
)

audio_config = BaseAudioConfig(
    sample_rate=22050,
    do_trim_silence=True,
    trim_db=60.0,
    signal_norm=False,
    mel_fmin=0.0,
    mel_fmax=8000,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
)

config = SpeedySpeechConfig(
    run_name="speedy_speech_ljspeech",
    audio=audio_config,
    batch_size=64,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    compute_input_seq_cache=True,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    text_cleaner="english_cleaners",
    use_phonemes=True,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    precompute_num_workers=4,
    print_step=50,
    print_eval=False,
    mixed_precision=False,
    max_seq_len=500000,
    output_path=output_path,
    datasets=[dataset_config],
)

# INITIALIZE THE AUDIO PROCESSOR
# The audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to `load_tts_samples`
# (see the sketch after this script).
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init model
model = ForwardTTS(config, ap, tokenizer)
# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)
# AND... 3,2,1... 🚀
trainer.fit()
```
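
As the comments in the script note, `load_tts_samples` also accepts a custom formatter instead of a formatter name. A minimal sketch, modeled on the built-in `ljspeech` formatter (the `pipe_formatter` name is hypothetical, and the sample dict keys are assumptions based on this TTS version's `TTS.tts.datasets` API):

```python
import os

def pipe_formatter(root_path, meta_file, **kwargs):
    """Hypothetical formatter: parse a pipe-separated metadata file
    (file_id|raw_text|normalized_text) into the sample dicts that
    load_tts_samples expects."""
    items = []
    with open(os.path.join(root_path, meta_file), "r", encoding="utf-8") as f:
        for line in f:
            cols = line.strip().split("|")
            items.append(
                {
                    "text": cols[2],
                    "audio_file": os.path.join(root_path, "wavs", cols[0] + ".wav"),
                    "speaker_name": "ljspeech",
                    "root_path": root_path,
                }
            )
    return items

# Usage (sketch): train_samples, eval_samples = load_tts_samples(
#     dataset_config, eval_split=True, formatter=pipe_formatter)
```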

2. Here is the generated config file:

```json
{
    "output_path": "/data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech",
    "logger_uri": null,
    "run_name": "speedy_speech_ljspeech",
    "project_name": null,
    "run_description": "\ud83d\udc38Coqui trainer run.",
    "print_step": 50,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "tensorboard",
    "log_model_step": null,
    "save_step": 10000,
    "save_n_checkpoints": 5,
    "save_checkpoints": true,
    "save_all_best": false,
    "save_best_after": 10000,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": -1,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": false,
    "epochs": 1000,
    "batch_size": 64,
    "eval_batch_size": 16,
    "grad_clip": 5.0,
    "scheduler_after_epoch": true,
    "lr": 0.0001,
    "optimizer": "Adam",
    "optimizer_params": {
        "betas": [
            0.9,
            0.998
        ],
        "weight_decay": 1e-06
    },
    "lr_scheduler": "NoamLR",
    "lr_scheduler_params": {
        "warmup_steps": 4000
    },
    "use_grad_scaler": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 54321,
    "model": "speedy_speech",
    "num_loader_workers": 4,
    "num_eval_loader_workers": 4,
    "use_noise_augment": false,
    "audio": {
        "fft_size": 1024,
        "win_length": 1024,
        "hop_length": 256,
        "frame_shift_ms": null,
        "frame_length_ms": null,
        "stft_pad_mode": "reflect",
        "sample_rate": 22050,
        "resample": false,
        "preemphasis": 0.0,
        "ref_level_db": 20,
        "do_sound_norm": false,
        "log_func": "np.log",
        "do_trim_silence": true,
        "trim_db": 60.0,
        "do_rms_norm": false,
        "db_level": null,
        "power": 1.5,
        "griffin_lim_iters": 60,
        "num_mels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": 8000,
        "spec_gain": 1.0,
        "do_amp_to_db_linear": true,
        "do_amp_to_db_mel": true,
        "pitch_fmax": 640.0,
        "pitch_fmin": 1.0,
        "signal_norm": false,
        "min_level_db": -100,
        "symmetric_norm": true,
        "max_norm": 4.0,
        "clip_norm": true,
        "stats_path": null
    },
    "use_phonemes": true,
    "phonemizer": "espeak",
    "phoneme_language": "en-us",
    "compute_input_seq_cache": true,
    "text_cleaner": "english_cleaners",
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": "/data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech/phoneme_cache",
    "characters": {
        "characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
        "vocab_dict": null,
        "pad": "<PAD>",
        "eos": "<EOS>",
        "bos": "<BOS>",
        "blank": "<BLNK>",
        "characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
        "punctuations": "!'(),-.:;? ",
        "phonemes": null,
        "is_unique": false,
        "is_sorted": true
    },
    "add_blank": false,
    "batch_group_size": 0,
    "loss_masking": null,
    "min_audio_len": 1,
    "max_audio_len": Infinity,
    "min_text_len": 1,
    "max_text_len": Infinity,
    "compute_f0": false,
    "compute_energy": false,
    "compute_linear_spec": false,
    "precompute_num_workers": 4,
    "start_by_longest": false,
    "shuffle": false,
    "drop_last": false,
    "datasets": [
        {
            "formatter": "ljspeech",
            "dataset_name": "",
            "path": "/data2/DATASET/LJSpeech-1.1",
            "meta_file_train": "metadata_train.csv",
            "ignored_speakers": null,
            "language": "",
            "phonemizer": "",
            "meta_file_val": "metadata_val.csv",
            "meta_file_attn_mask": ""
        }
    ],
    "test_sentences": [
        "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        "Be a voice, not an echo.",
        "I'm sorry Dave. I'm afraid I can't do that.",
        "This cake is great. It's so delicious and moist.",
        "Prior to November 22, 1963."
    ],
    "eval_split_max_size": null,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "base_model": "forward_tts",
    "model_args": {
        "num_chars": 131,
        "out_channels": 80,
        "hidden_channels": 128,
        "use_aligner": true,
        "use_pitch": false,
        "pitch_predictor_hidden_channels": 256,
        "pitch_predictor_kernel_size": 3,
        "pitch_predictor_dropout_p": 0.1,
        "pitch_embedding_kernel_size": 3,
        "use_energy": false,
        "energy_predictor_hidden_channels": 256,
        "energy_predictor_kernel_size": 3,
        "energy_predictor_dropout_p": 0.1,
        "energy_embedding_kernel_size": 3,
        "duration_predictor_hidden_channels": 256,
        "duration_predictor_kernel_size": 3,
        "duration_predictor_dropout_p": 0.1,
        "positional_encoding": true,
        "poisitonal_encoding_use_scale": true,
        "length_scale": 1,
        "encoder_type": "residual_conv_bn",
        "encoder_params": {
            "kernel_size": 4,
            "dilations": [
                1,
                2,
                4,
                1,
                2,
                4,
                1,
                2,
                4,
                1,
                2,
                4,
                1
            ],
            "num_conv_blocks": 2,
            "num_res_blocks": 13
        },
        "decoder_type": "residual_conv_bn",
        "decoder_params": {
            "kernel_size": 4,
            "dilations": [
                1,
                2,
                4,
                8,
                1,
                2,
                4,
                8,
                1,
                2,
                4,
                8,
                1,
                2,
                4,
                8,
                1
            ],
            "num_conv_blocks": 2,
            "num_res_blocks": 17
        },
        "detach_duration_predictor": true,
        "max_duration": 75,
        "num_speakers": 1,
        "use_speaker_embedding": false,
        "speakers_file": null,
        "use_d_vector_file": false,
        "d_vector_dim": null,
        "d_vector_file": null
    },
    "num_speakers": 0,
    "speakers_file": null,
    "use_speaker_embedding": false,
    "use_d_vector_file": false,
    "d_vector_file": false,
    "d_vector_dim": 0,
    "spec_loss_type": "l1",
    "duration_loss_type": "huber",
    "use_ssim_loss": false,
    "ssim_loss_alpha": 1.0,
    "dur_loss_alpha": 1.0,
    "spec_loss_alpha": 1.0,
    "aligner_loss_alpha": 1.0,
    "binary_align_loss_alpha": 0.3,
    "binary_loss_warmup_epochs": 150,
    "min_seq_len": 13,
    "max_seq_len": 500000,
    "r": 1,
    "f0_cache_path": null,
    "github_branch": "* dev"
}
```

### Expected behavior

In the SpeedySpeech paper, it is necessary to first train a teacher model to extract durations, and then use the extracted durations to train the SpeedySpeech student model. However, in this recipe there is no step that trains a teacher model, and I see `If the alignment network is used, the model learns the text-to-speech alignment from the data instead of using pre-computed durations.` in `TTS/tts/models/forward_tts.py`, line 177. I am a bit confused about this. Is this the reason why the model I trained doesn't perform well?
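
For reference, the two regimes differ in a single flag. A minimal sketch of flipping it, assuming the `model_args.use_aligner` field shown in the config dump above (the durations would then have to come from a separately trained teacher, e.g. attention masks extracted with `TTS/bin/compute_attention_masks.py` and referenced via `meta_file_attn_mask` in the dataset config):

```python
from TTS.tts.configs.speedy_speech_config import SpeedySpeechConfig

# Recipe default: the internal alignment network learns durations
# directly from the data, so no teacher model is trained first.
config = SpeedySpeechConfig(run_name="speedy_speech_aligner")
print(config.model_args.use_aligner)  # True, as in the config dump above

# Paper-style alternative (sketch): disable the aligner and rely on
# pre-computed teacher durations instead.
config.model_args.use_aligner = False
```

In other words, with the recipe defaults the missing teacher step appears to be intentional in this implementation.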

### Logs

Here is the log output from the end of training:

```
 > EPOCH: 998/1000
 --> /data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278

 > TRAINING (2023-12-10 00:42:38) 

   --> STEP: 44/47 -- GLOBAL_STEP: 46950
     | > loss_spec: 0.95118  (0.95257)
     | > loss_dur: 0.00258  (0.00194)
     | > loss_aligner: 4.66062  (4.61889)
     | > loss_binary_alignment: 0.60001  (0.59970)
     | > loss: 6.21439  (6.17310)
     | > duration_error: 3.77120  (3.68030)
     | > grad_norm: 7.32043  (7.81596)
     | > current_lr: 0.00002 
     | > step_time: 0.91460  (1.79724)
     | > loader_time: 3.51170  (1.37181)

 > EVALUATION 

  --> EVAL PERFORMANCE
     | > avg_loader_time: 0.34830 (+0.08128)
     | > avg_loss_spec: 2.09409 (-0.85148)
     | > avg_loss_dur: 0.01837 (+0.00047)
     | > avg_loss_aligner: 4.61922 (-0.00192)
     | > avg_loss_binary_alignment: 0.61072 (+0.00081)
     | > avg_loss: 7.34240 (-0.85212)
     | > avg_duration_error: 3.90552 (+0.00616)

 > EPOCH: 999/1000
 --> /data1/zpsun/workspace/TTS/TTS/recipes/ljspeech/speedy_speech/speedy_speech_ljspeech-December-07-2023_06+37PM-e4c5c278

 > TRAINING (2023-12-10 00:45:53) 

 > EVALUATION 

  --> EVAL PERFORMANCE
     | > avg_loader_time: 0.28263 (-0.06568)
     | > avg_loss_spec: 2.02141 (-0.07268)
     | > avg_loss_dur: 0.01793 (-0.00045)
     | > avg_loss_aligner: 4.61829 (-0.00093)
     | > avg_loss_binary_alignment: 0.61084 (+0.00012)
     | > avg_loss: 7.26846 (-0.07394)
     | > avg_duration_error: 3.90462 (-0.00090)
```

### Environment

```shell
{
    "CUDA": {
        "GPU": [
            "NVIDIA GeForce RTX 3090",
            "NVIDIA GeForce RTX 3090",
            "NVIDIA GeForce RTX 3090",
            "NVIDIA GeForce RTX 3090"
        ],
        "available": true,
        "version": "11.7"
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "2.0.0+cu117",
        "TTS": "0.13.2",
        "numpy": "1.21.6"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            "ELF"
        ],
        "processor": "x86_64",
        "python": "3.8.16",
        "version": "#224-Ubuntu SMP Mon Jun 19 13:30:12 UTC 2023"
    }
}
```

### Additional context

No response

stale[bot] commented 8 months ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look at our discussion channels.