Closed OmiWakode closed 2 years ago
Hi @OmiWakode ,
This notebook is old and compatible just with the Edresson/Coqui-TTS@multilingual-torchaudio-SE branch.
Currently, the TTS CLI works with the released YourTTS model. After installing the latest version of Coqui TTS, you can do zero-shot multi-speaker TTS using the following command:
tts --model_name "tts_models/multilingual/multi-dataset/your_tts" --text "This is an example" --speaker_wav ../ref.wav --language_idx "en" --out_path ../output_test.wav
Just update the text, speaker_wav (a waveform with the voice of the target speaker), and out_path :).
Can I use this same script for zero shot cloning with my custom vits model?
I want to perform zero shot cloning with my custom vits model, or maybe try to follow YourTTS inference to my custom vits.
Can I use this same script for zero shot cloning with my custom vits model?
Yeah, you can, but you need to remove the "--model_name" argument and add other params like: --model_path, --config_path, --speakers_file_path, and --language_ids_file_path (only if it is a multilingual model)
Hey, Tried that, again got the same error: Here is my code
!tts --config_path /config.json
--model_path best_model.pth
--speakers_file_path speakers.json
--text "This is an example"
--speaker_wav /content/drive/MyDrive/zero-shot-coqui-ai/videoplayback_1hr.wav
--out_path ../output_test.wav
Traceback (most recent call last):
File "/usr/local/bin/tts", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.7/dist-packages/TTS/bin/synthesize.py", line 287, in main
wav = synthesizer.tts(args.text, args.speaker_idx, args.language_idx, args.speaker_wav)
File "/usr/local/lib/python3.7/dist-packages/TTS/utils/synthesizer.py", line 260, in tts
d_vector=speaker_embedding,
File "/usr/local/lib/python3.7/dist-packages/TTS/tts/utils/synthesis.py", line 184, in synthesis
outputs = run_model_torch(model, text_inputs, speaker_id, style_mel, d_vector=d_vector, language_id=language_id)
File "/usr/local/lib/python3.7/dist-packages/TTS/tts/utils/synthesis.py", line 56, in run_model_torch
"language_ids": language_id,
File "/usr/local/lib/python3.7/dist-packages/TTS/tts/models/vits.py", line 974, in inference
lang_emb=lang_emb,
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/TTS/tts/layers/vits/stochastic_duration_predictor.py", line 233, in forward
x = x + self.cond(g)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 1178, in __getattr__
type(self).__name__, name))
AttributeError: 'StochasticDurationPredictor' object has no attribute 'cond'
Could you share your config.json?
Sure
{
"output_path": "/home/ubuntu/train_pipeline/ReChannelisedData/outputs",
"logger_uri": null,
"run_name": "last_vits",
"project_name": null,
"run_description": "\ud83d\udc38Coqui trainer run.",
"print_step": 25,
"plot_step": 100,
"model_param_stats": false,
"wandb_entity": null,
"dashboard_logger": "tensorboard",
"log_model_step": 10000,
"save_step": 10000,
"save_n_checkpoints": 5,
"save_checkpoints": true,
"save_all_best": false,
"save_best_after": 10000,
"target_loss": null,
"print_eval": false,
"test_delay_epochs": -1,
"run_eval": true,
"distributed_backend": "nccl",
"distributed_url": "tcp://localhost:54321",
"mixed_precision": true,
"epochs": 10,
"batch_size": 32,
"eval_batch_size": 16,
"grad_clip": [
1000,
1000
],
"scheduler_after_epoch": true,
"lr": 0.001,
"optimizer": "AdamW",
"optimizer_params": {
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"weight_decay": 0.01
},
"lr_scheduler": "",
"lr_scheduler_params": {},
"use_grad_scaler": false,
"cudnn_enable": true,
"cudnn_benchmark": true,
"torch_seed": 54321,
"model": "vits",
"num_loader_workers": 4,
"num_eval_loader_workers": 4,
"use_noise_augment": false,
"audio": {
"fft_size": 1024,
"win_length": 1024,
"hop_length": 256,
"frame_shift_ms": null,
"frame_length_ms": null,
"stft_pad_mode": "reflect",
"sample_rate": 22050,
"resample": true,
"preemphasis": 0.0,
"ref_level_db": 20,
"do_sound_norm": false,
"log_func": "np.log",
"do_trim_silence": true,
"trim_db": 23.0,
"do_rms_norm": false,
"db_level": null,
"power": 1.5,
"griffin_lim_iters": 60,
"num_mels": 80,
"mel_fmin": 0,
"mel_fmax": null,
"spec_gain": 1.0,
"do_amp_to_db_linear": false,
"do_amp_to_db_mel": true,
"pitch_fmax": 640.0,
"pitch_fmin": 0.0,
"signal_norm": false,
"min_level_db": -100,
"symmetric_norm": true,
"max_norm": 4.0,
"clip_norm": true,
"stats_path": null
},
"use_phonemes": false,
"phonemizer": null,
"phoneme_language": "en-us",
"compute_input_seq_cache": true,
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false,
"test_sentences_file": "",
"phoneme_cache_path": null,
"characters": {
"characters_class": "TTS.tts.utils.text.characters.Graphemes",
"vocab_dict": null,
"pad": "<PAD>",
"eos": "<EOS>",
"bos": "<BOS>",
"blank": "<BLNK>",
"characters": "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
"punctuations": "!'(),-.:;? ",
"phonemes": null,
"is_unique": false,
"is_sorted": true
},
"add_blank": true,
"batch_group_size": 0,
"loss_masking": null,
"sort_by_audio_len": false,
"min_audio_len": 1,
"max_audio_len": Infinity,
"min_text_len": 1,
"max_text_len": 325,
"compute_f0": false,
"compute_linear_spec": true,
"precompute_num_workers": 0,
"start_by_longest": false,
"datasets": [
{
"name": "indian_english_speakers",
"path": "/home/ubuntu/train_pipeline/ReChannelisedData/",
"meta_file_train": "metadata.csv",
"ignored_speakers": null,
"language": "en-us",
"meta_file_val": "",
"meta_file_attn_mask": ""
}
],
"test_sentences": [
[
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
],
[
"Be a voice, not an echo."
],
[
"I'm sorry Dave. I'm afraid I can't do that."
],
[
"This cake is great. It's so delicious and moist."
],
[
"Prior to November 22, 1963."
]
],
"eval_split_max_size": null,
"eval_split_size": 0.01,
"use_speaker_weighted_sampler": false,
"speaker_weighted_sampler_alpha": 1.0,
"use_language_weighted_sampler": false,
"language_weighted_sampler_alpha": 1.0,
"model_args": {
"num_chars": 67,
"out_channels": 513,
"spec_segment_size": 32,
"hidden_channels": 192,
"hidden_channels_ffn_text_encoder": 768,
"num_heads_text_encoder": 2,
"num_layers_text_encoder": 6,
"kernel_size_text_encoder": 3,
"dropout_p_text_encoder": 0.1,
"dropout_p_duration_predictor": 0.5,
"kernel_size_posterior_encoder": 5,
"dilation_rate_posterior_encoder": 1,
"num_layers_posterior_encoder": 16,
"kernel_size_flow": 5,
"dilation_rate_flow": 1,
"num_layers_flow": 4,
"resblock_type_decoder": "1",
"resblock_kernel_sizes_decoder": [
3,
7,
11
],
"resblock_dilation_sizes_decoder": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates_decoder": [
8,
8,
2,
2
],
"upsample_initial_channel_decoder": 512,
"upsample_kernel_sizes_decoder": [
16,
16,
4,
4
],
"use_sdp": true,
"noise_scale": 1.0,
"inference_noise_scale": 0.667,
"length_scale": 1,
"noise_scale_dp": 1.0,
"inference_noise_scale_dp": 1.0,
"max_inference_len": null,
"init_discriminator": true,
"use_spectral_norm_disriminator": false,
"use_speaker_embedding": false,
"num_speakers": 10,
"speakers_file": "/home/ubuntu/train_pipeline/ReChannelisedData/outputs/last_vits-March-29-2022_03+33PM-0000000/speakers.json",
"d_vector_file": "/content/drive/MyDrive/Custom_coqui/MyTTSDataset/sample_indian_embed_try.json",
"speaker_embedding_channels": 256,
"use_d_vector_file": true,
"d_vector_dim": 0,
"detach_dp_input": true,
"use_language_embedding": false,
"embedded_language_dim": 4,
"num_languages": 0,
"language_ids_file": null,
"use_speaker_encoder_as_loss": false,
"speaker_encoder_config_path": "/content/drive/MyDrive/Custom_coqui/ASP-oficial-model-20220314T094714Z-001/H-ASP-oficial-model/config.json",
"speaker_encoder_model_path": "/content/drive/MyDrive/Custom_coqui/ASP-oficial-model-20220314T094714Z-001/H-ASP-oficial-model/converted_checkpoint.pth.tar",
"condition_dp_on_speaker": true,
"freeze_encoder": false,
"freeze_DP": false,
"freeze_PE": false,
"freeze_flow_decoder": false,
"freeze_waveform_decoder": false
},
"lr_gen": 0.0002,
"lr_disc": 0.0002,
"lr_scheduler_gen": "ExponentialLR",
"lr_scheduler_gen_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"lr_scheduler_disc": "ExponentialLR",
"lr_scheduler_disc_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"kl_loss_alpha": 1.0,
"disc_loss_alpha": 1.0,
"gen_loss_alpha": 1.0,
"feat_loss_alpha": 1.0,
"mel_loss_alpha": 45.0,
"dur_loss_alpha": 1.0,
"speaker_encoder_loss_alpha": 1.0,
"return_wav": true,
"r": 1,
"num_speakers": 0,
"use_speaker_embedding": false,
"speakers_file": "/home/ubuntu/train_pipeline/ReChannelisedData/outputs/last_vits-March-29-2022_03+33PM-0000000/speakers.json",
"speaker_embedding_channels": 256,
"language_ids_file": null,
"use_language_embedding": false,
"use_d_vector_file": true,
"d_vector_file": "/content/drive/MyDrive/Custom_coqui/MyTTSDataset/sample_indian_embed_try.json",
"d_vector_dim": 0
}
Hi @OmiWakode You need to define d_vector_dim in model_args and in the general config (at the end of the config.json file). Currently, in your config.json "d_vector_dim" is equal to zero. In this way, the model will create the layers necessary to condition on the speaker embedding. Please set "d_vector_dim" to the external speaker embedding dim in all occurrences in the config file.
The issue is still an issue by the way.
Hey! I'm trying to run models with speaker consistency loss and the inference doesn't run:
import torch
from TTS.tts.utils.synthesis import synthesis
model_root = path/to/model
C = load_config(model_root + '/config.json')
state_dict = torch.load(model_root + '/best_model.pth')
model = setup_model(config=C)
model.load_state_dict(state_dict['model'])
res = synthesis(model, text="This is an example of the voice produced by multi speaker model.",
CONFIG=C,
use_cuda=False,
d_vector=torch.randn(512),
)
It returns the same error as in here.
AttributeError: 'StochasticDurationPredictor' object has no attribute 'cond'
Fun fact: defining d_vector_dim doesn't help anymore.
When I change config to:
C.model_args['use_d_vector_file'] = True
C.model_args['d_vector_file'] = TTS_SPEAKERS
it allows partially loading the model weights with strict=False
:
model.load_state_dict(model_weights, strict=False)
But the inference is of understandably terrible quality since some layers didn't load and contain pure noise.
Another option is to change config with:
C.model_args['d_vector_file'] = None
C.model_args['speaker_encoder_model_path'] = None
C.model_args['speaker_encoder_config_path'] = None
C.model_args['use_speaker_encoder_as_loss'] = False
C.model_args['use_d_vector_file'] = False
C.use_speaker_embedding = True
This way, weights will load without raising error, but since there's no trace of speaker encoder left in model, it would raise the same error as in OPs case:
AttributeError: 'StochasticDurationPredictor' object has no attribute 'cond'
Steps to reproduce:
CoquiTTS version: 0.8.0 My model config:
{
"output_path": "/home/frappuccino/recipes/out/",
"logger_uri": null,
"run_name": "vits_spk_consist",
"project_name": null,
"run_description": "\ud83d\udc38Coqui trainer run.",
"print_step": 25,
"plot_step": 100,
"model_param_stats": false,
"wandb_entity": null,
"dashboard_logger": "tensorboard",
"log_model_step": 10000,
"save_step": 10000,
"save_n_checkpoints": 5,
"save_checkpoints": true,
"save_all_best": false,
"save_best_after": 10000,
"target_loss": null,
"print_eval": false,
"test_delay_epochs": -1,
"run_eval": true,
"run_eval_steps": null,
"distributed_backend": "nccl",
"distributed_url": "tcp://localhost:54321",
"mixed_precision": false,
"epochs": 1000,
"batch_size": 32,
"eval_batch_size": 32,
"grad_clip": [
5.0,
5.0
],
"scheduler_after_epoch": true,
"lr": 0.001,
"optimizer": "AdamW",
"optimizer_params": {
"betas": [
0.8,
0.99
],
"eps": 1e-09,
"weight_decay": 0.01
},
"lr_scheduler": "",
"lr_scheduler_params": {},
"use_grad_scaler": false,
"cudnn_enable": true,
"cudnn_deterministic": false,
"cudnn_benchmark": false,
"training_seed": 54321,
"model": "vits",
"num_loader_workers": 12,
"num_eval_loader_workers": 1,
"use_noise_augment": false,
"audio": {
"fft_size": 1024,
"sample_rate": 16000,
"win_length": 1024,
"hop_length": 256,
"num_mels": 80,
"mel_fmin": 0,
"mel_fmax": null
},
"use_phonemes": true,
"phonemizer": "espeak",
"phoneme_language": "en",
"compute_input_seq_cache": true,
"text_cleaner": "english_cleaners",
"enable_eos_bos_chars": false,
"test_sentences_file": "",
"phoneme_cache_path": "/home/frappuccino/recipes/out/phoneme_cache",
"characters": {
"characters_class": "TTS.tts.utils.text.characters.IPAPhonemes",
"vocab_dict": null,
"pad": "#",
"eos": "$",
"bos": "^",
"blank": "_",
"characters": "iy\u0268\u0289\u026fu\u026a\u028f\u028ae\u00f8\u0258\u0259\u0275\u0264o\u025b\u0153\u025c\u025e\u028c\u0254\u00e6\u0250a\u0276\u0251\u0252\u1d7b\u0298\u0253\u01c0\u0257\u01c3\u0284\u01c2\u0260\u01c1\u029bpbtd\u0288\u0256c\u025fk\u0261q\u0262\u0294\u0274\u014b\u0272\u0273n\u0271m\u0299r\u0280\u2c71\u027e\u027d\u0278\u03b2fv\u03b8\u00f0sz\u0283\u0292\u0282\u0290\u00e7\u029dx\u0263\u03c7\u0281\u0127\u0295h\u0266\u026c\u026e\u028b\u0279\u027bj\u0270l\u026d\u028e\u029f\u02c8\u02cc\u02d0\u02d1\u028dw\u0265\u029c\u02a2\u02a1\u0255\u0291\u027a\u0267\u02b2\u025a\u02de\u026b",
"punctuations": "!'(),-.:;? ",
"phonemes": null,
"is_unique": true,
"is_sorted": true
},
"add_blank": true,
"batch_group_size": 0,
"loss_masking": null,
"min_audio_len": 1,
"max_audio_len": Infinity,
"min_text_len": 1,
"max_text_len": 500,
"compute_f0": false,
"compute_linear_spec": true,
"precompute_num_workers": 0,
"start_by_longest": false,
"datasets": [
{
"name": "libri_tts",
"path": "/home/frappuccino/data/librispeech16k/train-clean-360-16k/",
"meta_file_train": "",
"ignored_speakers": null,
"language": "en",
"meta_file_val": "",
"meta_file_attn_mask": ""
}
],
"test_sentences": [
[
"It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent."
],
[
"Be a voice, not an echo."
],
[
"I'm sorry Dave. I'm afraid I can't do that."
],
[
"This cake is great. It's so delicious and moist."
],
[
"Prior to November 22, 1963."
]
],
"eval_split_max_size": null,
"eval_split_size": 0.01,
"use_speaker_weighted_sampler": false,
"speaker_weighted_sampler_alpha": 1.0,
"use_language_weighted_sampler": false,
"language_weighted_sampler_alpha": 1.0,
"use_length_weighted_sampler": false,
"length_weighted_sampler_alpha": 1.0,
"model_args": {
"num_chars": 131,
"out_channels": 513,
"spec_segment_size": 62,
"hidden_channels": 192,
"hidden_channels_ffn_text_encoder": 768,
"num_heads_text_encoder": 2,
"num_layers_text_encoder": 10,
"kernel_size_text_encoder": 3,
"dropout_p_text_encoder": 0.1,
"dropout_p_duration_predictor": 0.5,
"kernel_size_posterior_encoder": 5,
"dilation_rate_posterior_encoder": 1,
"num_layers_posterior_encoder": 16,
"kernel_size_flow": 5,
"dilation_rate_flow": 1,
"num_layers_flow": 4,
"resblock_type_decoder": "2",
"resblock_kernel_sizes_decoder": [
3,
7,
11
],
"resblock_dilation_sizes_decoder": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"upsample_rates_decoder": [
8,
8,
2,
2
],
"upsample_initial_channel_decoder": 512,
"upsample_kernel_sizes_decoder": [
16,
16,
4,
4
],
"periods_multi_period_discriminator": [
2,
3,
5,
7,
11
],
"use_sdp": true,
"noise_scale": 1.0,
"inference_noise_scale": 0.3,
"length_scale": 1.5,
"noise_scale_dp": 0.6,
"inference_noise_scale_dp": 0.3,
"max_inference_len": null,
"init_discriminator": true,
"use_spectral_norm_disriminator": false,
"use_speaker_embedding": false,
"num_speakers": 902,
"speakers_file": "out/vits_spk_consist-October-20-2022_07+43AM-9017f31/speakers.pth",
"d_vector_file": null,
"speaker_embedding_channels": 512,
"use_d_vector_file": false,
"d_vector_dim": 512,
"detach_dp_input": true,
"use_language_embedding": false,
"embedded_language_dim": 4,
"num_languages": 0,
"language_ids_file": null,
"use_speaker_encoder_as_loss": true,
"speaker_encoder_config_path": "config_se.json",
"speaker_encoder_model_path": "SE_checkpoint.pth.tar",
"condition_dp_on_speaker": true,
"freeze_encoder": false,
"freeze_DP": false,
"freeze_PE": false,
"freeze_flow_decoder": false,
"freeze_waveform_decoder": false,
"encoder_sample_rate": null,
"interpolate_z": true,
"reinit_DP": false,
"reinit_text_encoder": false
},
"lr_gen": 0.0002,
"lr_disc": 0.0002,
"lr_scheduler_gen": "ExponentialLR",
"lr_scheduler_gen_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"lr_scheduler_disc": "ExponentialLR",
"lr_scheduler_disc_params": {
"gamma": 0.999875,
"last_epoch": -1
},
"kl_loss_alpha": 1.0,
"disc_loss_alpha": 1.0,
"gen_loss_alpha": 1.0,
"feat_loss_alpha": 1.0,
"mel_loss_alpha": 45.0,
"dur_loss_alpha": 1.0,
"speaker_encoder_loss_alpha": 9.0,
"return_wav": true,
"use_weighted_sampler": false,
"weighted_sampler_attrs": {},
"weighted_sampler_multipliers": {},
"r": 1,
"num_speakers": 902,
"use_speaker_embedding": false,
"speakers_file": "out/vits_spk_consist-October-20-2022_07+43AM-9017f31/speakers.pth",
"speaker_embedding_channels": 512,
"language_ids_file": null,
"use_language_embedding": false,
"use_d_vector_file": false,
"d_vector_file": null,
"d_vector_dim": 512
}
🐛 Description
AttributeError: 'StochasticDurationPredictor' object has no attribute 'cond' I got this error while running inference with the vits model, for zero-shot learning. I have the following function for synthesis and have replaced d_vector with the embedding of the speaker whose voice I want to clone by zero-shot cloning. (reference_emb is a d_vector generated from the speaker encoder) I have used the script from here for inference.
Traceback of the error
Additional Information:
I have trained my Vits Model with a separate d_vector_file, and I want to make my custom YourTTS on my own data.
🐸TTS Version : v0.6.1
GPU models and configuration: Colab (Free)
Installed everything as mentioned in the above colab link