coqui-ai / TTS

🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production
http://coqui.ai
Mozilla Public License 2.0

VITS multi-speaker does not work [Bug] #3431

Closed: zjwang21 closed this issue 8 months ago

zjwang21 commented 9 months ago

Describe the bug

After 130,000 steps of VITS multi-speaker training on AISHELL-3, the generated wav always has the same speaker's voice regardless of the speaker_idx passed at inference. I have stepped through the inference code and it looks correct. Can anyone give me some insights?
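A quick way to demonstrate the symptom from a trained checkpoint (a minimal sketch; the paths, speaker IDs, and the Synthesizer arguments are placeholders/assumptions for a recent Coqui TTS release, not taken from this report):

```python
# Sketch: synthesize the same sentence with two different speaker IDs and
# listen to / diff the outputs. Paths and speaker names are placeholders.
from TTS.utils.synthesizer import Synthesizer

synth = Synthesizer(
    tts_checkpoint="/path/to/best_model.pth",   # trained VITS checkpoint
    tts_config_path="/path/to/config.json",     # the config posted below
    tts_speakers_file="/path/to/speakers.pth",  # speaker-name -> index map
    use_cuda=True,
)

for spk in ["SSB0005", "SSB0009"]:              # placeholder AISHELL-3 speaker IDs
    wav = synth.tts("今天天气不错呀", speaker_name=spk)
    synth.save_wav(wav, f"test_{spk}.wav")      # should sound like different voices
```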

To Reproduce

{ "output_path": "/home/nfs02/wangzj/checkpoints/aishell3-new", "logger_uri": null, "run_name": "aishell3_new", "project_name": null, "run_description": "\ud83d\udc38Coqui trainer run.", "print_step": 1000, "plot_step": 100, "model_param_stats": false, "wandb_entity": null, "dashboard_logger": "tensorboard", "save_on_interrupt": true, "log_model_step": 10000, "save_step": 10000, "save_n_checkpoints": 5, "save_checkpoints": true, "save_all_best": false, "save_best_after": 10000, "target_loss": null, "print_eval": false, "test_delay_epochs": -1, "run_eval": true, "run_eval_steps": null, "distributed_backend": "nccl", "distributed_url": "tcp://localhost:54321", "mixed_precision": true, "precision": "fp16", "epochs": 1000, "batch_size": 64, "eval_batch_size": 16, "grad_clip": [ 1000, 1000 ], "scheduler_after_epoch": true, "lr": 0.001, "optimizer": "AdamW", "optimizer_params": { "betas": [ 0.8, 0.99 ], "eps": 1e-09, "weight_decay": 0.01 }, "lr_scheduler": null, "lr_scheduler_params": {}, "use_grad_scaler": false, "allow_tf32": false, "cudnn_enable": true, "cudnn_deterministic": false, "cudnn_benchmark": false, "training_seed": 54321, "model": "vits", "num_loader_workers": 4, "num_eval_loader_workers": 4, "use_noise_augment": false, "audio": { "fft_size": 1024, "sample_rate": 22050, "win_length": 1024, "hop_length": 256, "num_mels": 80, "mel_fmin": 0, "mel_fmax": null }, "use_phonemes": true, "phonemizer": "chinese_phonemzier", "phoneme_language": null, "compute_input_seq_cache": true, "text_cleaner": null, "enable_eos_bos_chars": false, "test_sentences_file": "", "phoneme_cache_path": "/home/nfs02/wangzj/checkpoints/aishell3-new/phoneme_cache", "characters": { "characters_class": "TTS.tts.utils.text.characters.PinyinPhonemes", "vocab_dict": null, "pad": "", "eos": "", "bos": "", "blank": "", "characters": "o3 uo3 er sh ou2 iou1 m ong3 ao2 vn1 ang2 e4 uei3 ian4 vn4 ia4 ve2 uai4 t #3 uang1 uan4 iao1 ang4 ve1 ai4 uang2 uang3 iang1 ong1 p iii ing4 ang1 ou1 ao1 an3 ii en4 v3 ao3 o4 #2 o2 uei4 ve3 uen3 i2 uan1 van4 x ie3 a ie1 iou4 an2 ou4 iao4 eos u4 ua2 z iong4 eng3 uang4 van1 er2 iao3 vn3 in3 a1 e1 en2 e3 i3 d i iii1 v4 o1 i1 ii4 n ia1 uei1 uo1 iong3 e2 van2 ueng1 ai2 ii2 ia ii3 iou2 c f l ian2 zh uan3 o uen4 ai1 vn2 ei2 ong2 ua ou3 e ing2 iang2 uo4 uan2 ie4 ei3 en3 in1 ia2 j er3 ang3 #0 u1 uo in2 ou eng2 s ei4 iou3 eng1 iang3 ao4 eng4 iii2 uai1 #1 ong4 iang4 sil ^ sp ii1 er4 uen1 ei b iao2 ia3 iii3 an1 uai3 ai3 a2 i4 ua4 en g uai2 uen2 iii4 an4 k ua3 a4 v2 ch ian3 u2 van3 ie2 v1 uo2 u3 ing1 ua1 in4 ei1 r en1 a3 ing3 ian1 uei2 h iong2 q iong1 ve4", "punctuations": "!'(),-.:;? 
", "phonemes": null, "is_unique": false, "is_sorted": true }, "add_blank": false, "batch_group_size": 5, "loss_masking": null, "min_audio_len": 1, "max_audio_len": Infinity, "min_text_len": 1, "max_text_len": 512, "compute_f0": false, "compute_energy": false, "compute_linear_spec": true, "precompute_num_workers": 10, "start_by_longest": false, "shuffle": false, "drop_last": false, "datasets": [ { "formatter": "aishell3", "dataset_name": "", "path": "/home/nfs02/wangzj/dataset/aishell3", "meta_file_train": "", "ignored_speakers": null, "language": "", "phonemizer": "", "meta_file_val": "", "meta_file_attn_mask": "" } ], "test_sentences": [ "\u4eca\u5929\u5929\u6c14\u4e0d\u9519\u5440", "\u6211\u6bcf\u5468\u8fdb\u884c\u4e09\u6b21\u5065\u8eab", "\u4eca\u591c\u7684\u6c5f\u6ee9\u6ca1\u6709\u70df\u82b1", "\u8fd9\u4e2a\u6708\u6211\u53d1\u4e86\u4e5d\u5343\u516b\u767e\u4e03\u5341\u516d\u5757\u94b1\u7684\u5de5\u8d44" ], "eval_split_max_size": null, "eval_split_size": 0.01, "use_speaker_weighted_sampler": false, "speaker_weighted_sampler_alpha": 1.0, "use_language_weighted_sampler": false, "language_weighted_sampler_alpha": 1.0, "use_length_weighted_sampler": false, "length_weighted_sampler_alpha": 1.0, "model_args": { "num_chars": 205, "out_channels": 513, "spec_segment_size": 32, "hidden_channels": 192, "hidden_channels_ffn_text_encoder": 768, "num_heads_text_encoder": 2, "num_layers_text_encoder": 6, "kernel_size_text_encoder": 3, "dropout_p_text_encoder": 0.1, "dropout_p_duration_predictor": 0.5, "kernel_size_posterior_encoder": 5, "dilation_rate_posterior_encoder": 1, "num_layers_posterior_encoder": 16, "kernel_size_flow": 5, "dilation_rate_flow": 1, "num_layers_flow": 4, "resblock_type_decoder": "1", "resblock_kernel_sizes_decoder": [ 3, 7, 11 ], "resblock_dilation_sizes_decoder": [ [ 1, 3, 5 ], [ 1, 3, 5 ], [ 1, 3, 5 ] ], "upsample_rates_decoder": [ 8, 8, 2, 2 ], "upsample_initial_channel_decoder": 512, "upsample_kernel_sizes_decoder": [ 16, 16, 4, 4 ], "periods_multi_period_discriminator": [ 2, 3, 5, 7, 11 ], "use_sdp": true, "noise_scale": 1.0, "inference_noise_scale": 0.667, "length_scale": 1, "noise_scale_dp": 1.0, "inference_noise_scale_dp": 1.0, "max_inference_len": null, "init_discriminator": true, "use_spectral_norm_disriminator": false, "use_speaker_embedding": true, "num_speakers": 174, "speakers_file": "/home/nfs02/wangzj/checkpoints/aishell3-new/aishell3_new-December-12-2023_10+57PM-0000000/speakers.pth", "d_vector_file": null, "speaker_embedding_channels": 256, "use_d_vector_file": false, "d_vector_dim": 0, "detach_dp_input": true, "use_language_embedding": false, "embedded_language_dim": 4, "num_languages": 0, "language_ids_file": null, "use_speaker_encoder_as_loss": false, "speaker_encoder_config_path": "", "speaker_encoder_model_path": "", "condition_dp_on_speaker": true, "freeze_encoder": false, "freeze_DP": false, "freeze_PE": false, "freeze_flow_decoder": false, "freeze_waveform_decoder": false, "encoder_sample_rate": null, "interpolate_z": true, "reinit_DP": false, "reinit_text_encoder": false }, "lr_gen": 0.0002, "lr_disc": 0.0002, "lr_scheduler_gen": "ExponentialLR", "lr_scheduler_gen_params": { "gamma": 0.999875, "last_epoch": -1 }, "lr_scheduler_disc": "ExponentialLR", "lr_scheduler_disc_params": { "gamma": 0.999875, "last_epoch": -1 }, "kl_loss_alpha": 1.0, "disc_loss_alpha": 1.0, "gen_loss_alpha": 1.0, "feat_loss_alpha": 1.0, "mel_loss_alpha": 45.0, "dur_loss_alpha": 1.0, "speaker_encoder_loss_alpha": 1.0, "return_wav": true, "use_weighted_sampler": false, 
"weighted_sampler_attrs": {}, "weighted_sampler_multipliers": {}, "r": 1, "num_speakers": 174, "use_speaker_embedding": true, "speakers_file": "/home/nfs02/wangzj/checkpoints/aishell3-new/aishell3_new-December-12-2023_10+57PM-0000000/speakers.pth", "speaker_embedding_channels": 256, "language_ids_file": null, "use_language_embedding": false, "use_d_vector_file": false, "d_vector_file": null, "d_vector_dim": 0 }

Expected behavior

No response

Logs

No response

Environment

> Training Environment:
 | > Backend: Torch
 | > Mixed precision: True
 | > Precision: fp16
 | > Current device: 0
 | > Num. of GPUs: 1
 | > Num. of CPUs: 20
 | > Num. of Torch Threads: 20
 | > Torch seed: 54321
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False

1 V100 (32 GB) GPU

Additional context

No response

stale[bot] commented 8 months ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look at our discussion channels.

nicemanis commented 7 months ago

What does the aishell3 data formatter look like?
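(The reporter's formatter is not shown in the thread. For comparison, a Coqui-style multi-speaker formatter returns one dict per utterance, and the per-utterance speaker_name is what the speaker embedding is trained on. The sketch below is hypothetical; its assumptions about the AISHELL-3 layout, train/content.txt and train/wav/<speaker>/*.wav, may not match the reporter's setup.)

```python
# Hypothetical sketch of a multi-speaker formatter in Coqui TTS style; the
# AISHELL-3 layout below is an assumption. The important part for this bug:
# "speaker_name" must differ per speaker, otherwise every item is trained as
# the same voice.
import os

def aishell3(root_path, meta_file=None, ignored_speakers=None, **kwargs):
    items = []
    content = os.path.join(root_path, "train", "content.txt")
    with open(content, encoding="utf-8") as f:
        for line in f:
            wav_name, text = line.strip().split("\t", 1)
            speaker = wav_name[:7]  # e.g. "SSB0005" from "SSB00050001.wav"
            if ignored_speakers and speaker in ignored_speakers:
                continue
            items.append({
                "text": text,
                "audio_file": os.path.join(root_path, "train", "wav", speaker, wav_name),
                "speaker_name": speaker,
                "root_path": root_path,
            })
    return items
```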