Closed: VafaKnm closed this issue 6 months ago.
Hey @VafaKnm, could you send me the script/config you're using? From what I understand, your torch_files folder doesn't have a config.json. What steps did you follow here?
Here are the steps I followed:
# Install dependencies
sudo apt-get install festival espeak-ng mbrola
# Install phonemizer
pip install phonemizer
git clone https://github.com/isi-nlp/uroman.git
cd uroman
export UROMAN=$(pwd)
cd <path-to-finetune-hf-vits-repo>
python convert_original_discriminator_checkpoint.py --language_code fas --pytorch_dump_folder_path finetune-hf-vits/torch_files
accelerate launch run_vits_finetuning.py /finetune-hf-vits/persian_tts_train_conf.json
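(For reference, here is a minimal pre-flight check that the conversion step actually wrote the files run_vits_finetuning.py expects. This is just a sketch; it assumes the dump folder is the finetune-hf-vits/torch_files path passed to --pytorch_dump_folder_path above.)

from pathlib import Path

# Folder passed as --pytorch_dump_folder_path above (assumption: script run from the same directory).
dump_dir = Path("finetune-hf-vits/torch_files")

# Files the conversion / save_pretrained step should have produced.
for name in ["config.json", "model.safetensors", "preprocessor_config.json",
             "tokenizer_config.json", "special_tokens_map.json", "added_tokens.json", "vocab.json"]:
    print(name, "->", "found" if (dump_dir / name).exists() else "MISSING")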
These are the "persian_tts_train_conf.json" parameters:
{
"project_name": "mms_persion_finetuning",
"push_to_hub": false,
"report_to": ["tensorboard"],
"overwrite_output_dir": true,
"output_dir": "/home/user1/finetune-hf-vits/output_models",
"dataset_name": "/home/user1/finetune-hf-vits/TTS_data/new_data_version/farshad_tts_ds",
"dataset_config_name": "male",
"audio_column_name": "audio",
"text_column_name":"text",
"train_split_name": "/home/user1/finetune-hf-vits/TTS_data/new_data_version/farshad_tts_train_ds",
"eval_split_name": "/home/user1/finetune-hf-vits/TTS_data/new_data_version/farshad_tts_val_ds",
"speaker_id_column_name": "speaker_id",
"override_speaker_embeddings": true,
"filter_on_speaker_id": 1,
"full_generation_sample_text": "سلام این یک جمله به زبان فارسی است",
"max_duration_in_seconds": 40.709,
"min_duration_in_seconds": 0.019,
"max_tokens_length": 879,
"model_name_or_path": "/home/user1/finetune-hf-vits/torch_files/model.safetensors",
"preprocessing_num_workers": 2,
"do_train": true,
"num_train_epochs": 200,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": false,
"per_device_train_batch_size": 16,
"learning_rate": 2e-5,
"adam_beta1": 0.8,
"adam_beta2": 0.99,
"warmup_ratio": 0.01,
"group_by_length": false,
"do_eval": true,
"eval_steps": 50,
"per_device_eval_batch_size": 16,
"max_eval_samples": 25,
"do_step_schedule_per_epoch": true,
"weight_disc": 3,
"weight_fmaps": 1,
"weight_gen": 1,
"weight_kl": 1.5,
"weight_duration": 1,
"weight_mel": 35,
"fp16": true,
"seed": 456
}
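A quick way to check that this config parses and that the local paths in it exist, before launching accelerate. This is only a sketch; the key names are taken from the config above, and the config file is assumed to be in the current directory.

import json
from pathlib import Path

cfg = json.loads(Path("persian_tts_train_conf.json").read_text(encoding="utf-8"))

# Local paths should exist on disk; Hub repo ids (e.g. "facebook/mms-tts-fas") will not.
for key in ("model_name_or_path", "dataset_name", "train_split_name", "eval_split_name", "output_dir"):
    value = str(cfg.get(key, ""))
    print(key, "->", value, "| exists locally:", Path(value).exists())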
These are the JSON files that were created inside the pytorch_dump_folder_path folder:
added_tokens.json:
{
"<unk>": 44
}
config.json:
{
"activation_dropout": 0.1,
"architectures": [
"VitsModelForPreTraining"
],
"attention_dropout": 0.1,
"depth_separable_channels": 2,
"depth_separable_num_layers": 3,
"discriminator_kernel_size": 5,
"discriminator_period_channels": [
1,
32,
128,
512,
1024
],
"discriminator_periods": [
2,
3,
5,
7,
11
],
"discriminator_scale_channels": [
1,
16,
64,
256,
1024
],
"discriminator_stride": 3,
"duration_predictor_dropout": 0.5,
"duration_predictor_filter_channels": 256,
"duration_predictor_flow_bins": 10,
"duration_predictor_kernel_size": 3,
"duration_predictor_num_flows": 4,
"duration_predictor_tail_bound": 5.0,
"ffn_dim": 768,
"ffn_kernel_size": 3,
"flow_size": 192,
"hidden_act": "relu",
"hidden_dropout": 0.1,
"hidden_size": 192,
"hop_length": 256,
"initializer_range": 0.02,
"layer_norm_eps": 1e-05,
"layerdrop": 0.1,
"leaky_relu_slope": 0.1,
"model_type": "vits",
"noise_scale": 0.667,
"noise_scale_duration": 0.8,
"num_attention_heads": 2,
"num_hidden_layers": 6,
"num_speakers": 1,
"posterior_encoder_num_wavenet_layers": 16,
"prior_encoder_num_flows": 4,
"prior_encoder_num_wavenet_layers": 4,
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_kernel_sizes": [
3,
7,
11
],
"sampling_rate": 16000,
"segment_size": 8192,
"speaker_embedding_size": 0,
"speaking_rate": 1.0,
"spectrogram_bins": 513,
"torch_dtype": "float32",
"transformers_version": "4.37.2",
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"upsample_rates": [
8,
8,
2,
2
],
"use_bias": true,
"use_stochastic_duration_prediction": true,
"vocab_size": 44,
"wavenet_dilation_rate": 1,
"wavenet_dropout": 0.0,
"wavenet_kernel_size": 5,
"window_size": 4
}
preprocessor_config.json:
{
"feature_extractor_type": "VitsFeatureExtractor",
"feature_size": 80,
"hop_length": 256,
"max_wav_value": 32768.0,
"n_fft": 1024,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": false,
"sampling_rate": 16000
}
special_tokens_map.json:
{
"pad_token": "Ù„",
"unk_token": "<unk>"
}
tokenizer_config.json:
{
"add_blank": true,
"added_tokens_decoder": {
"0": {
"content": "\u0644",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
},
"44": {
"content": "<unk>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false,
"special": true
}
},
"clean_up_tokenization_spaces": true,
"is_uroman": true,
"language": "fas",
"model_max_length": 1000000000000000019884624838656,
"normalize": true,
"pad_token": "\u0644",
"phonemize": false,
"tokenizer_class": "VitsTokenizer",
"unk_token": "<unk>",
"verbose": false
}
vocab.json:
{
" ": 30,
"'": 40,
"-": 35,
"_": 6,
"ء": 14,
"آ": 34,
"أ": 3,
"ؤ": 36,
"ئ": 4,
"ا": 41,
"ب": 26,
"ت": 25,
"ث": 20,
"ج": 8,
"ح": 18,
"خ": 32,
"د": 28,
"ذ": 5,
"ر": 9,
"ز": 43,
"س": 39,
"ش": 31,
"ص": 16,
"ض": 42,
"ط": 19,
"ظ": 24,
"ع": 2,
"غ": 29,
"ف": 11,
"ق": 17,
"ك": 7,
"ل": 0,
"م": 23,
"ن": 22,
"ه": 10,
"و": 27,
"ي": 13,
"ٔ": 21,
"پ": 1,
"چ": 37,
"ژ": 38,
"ک": 33,
"گ": 12,
"ی": 15
}
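These files can be loaded back through transformers to confirm they are readable. A minimal sketch, assuming the same dump folder as in the convert command above:

from transformers import VitsConfig, VitsTokenizer

dump_dir = "finetune-hf-vits/torch_files"  # --pytorch_dump_folder_path from above (assumption)

config = VitsConfig.from_pretrained(dump_dir)        # reads config.json
tokenizer = VitsTokenizer.from_pretrained(dump_dir)  # reads tokenizer_config.json / vocab.json

print(config.vocab_size)      # 44, as in config.json above
print(tokenizer.get_vocab())  # the character-to-id map from vocab.json above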
Among these JSON files, added_tokens.json, config.json, and preprocessor_config.json are in ASCII format, while the rest are in UTF-8 format.
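That difference is harmless by itself: ASCII is a strict subset of UTF-8, and JSON escapes like \u0644 decode to the same characters as the raw UTF-8 bytes. A small illustration (sketch) of how json.dump's default ensure_ascii=True produces the "ASCII" look:

import json

data = {"pad_token": "ل", "unk_token": "<unk>"}

escaped = json.dumps(data)                  # default ensure_ascii=True -> "\u0644", pure ASCII
raw = json.dumps(data, ensure_ascii=False)  # keeps the character as UTF-8

print(escaped)
print(raw)
print(json.loads(escaped) == json.loads(raw))  # True: both decode to identical content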
Update: The error was raised because of a wrong setting for the "model_name_or_path" parameter in the "persian_tts_train_conf.json" file. The value of "model_name_or_path" must be a repo name or its path; for my language, Persian (Farsi), it is "facebook/mms-tts-fas". So I created a model folder, which contains the model itself and the relevant JSON files, in the "pytorch_dump_folder_path" using the repo name with this command:
python convert_original_discriminator_checkpoint.py --language_code fas --pytorch_dump_folder_path ./facebook/mms-tts-fas
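One way to confirm the value resolves the way the training script needs (a folder or repo id rather than a direct .safetensors path, as described above) is to load it with from_pretrained. A sketch; the extra discriminator weights added by the conversion should simply be skipped with a warning by the plain VitsModel class:

from transformers import VitsModel, VitsTokenizer

# Either the converted local folder created by the command above, or the Hub checkpoint itself.
model_name_or_path = "./facebook/mms-tts-fas"

model = VitsModel.from_pretrained(model_name_or_path)
tokenizer = VitsTokenizer.from_pretrained(model_name_or_path)
print(type(model).__name__, model.config.sampling_rate)  # VitsModel 16000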
I also added "uroman_path" to "persian_tts_train_conf.json" and set the "is_uroman" parameter of the "tokenizer_config.json" file to true, as required for my language.
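For anyone else hitting this, the uroman step amounts to romanizing the Persian text before tokenization. A rough sketch of what that looks like, assuming $UROMAN points at the cloned uroman repo as exported above; uromanize here is only an illustrative helper, not a function from the repo:

import os
import subprocess

def uromanize(text: str, lang: str = "fas") -> str:
    # bin/uroman.pl reads text on stdin and writes the romanized form to stdout.
    uroman_pl = os.path.join(os.environ["UROMAN"], "bin", "uroman.pl")
    result = subprocess.run(
        ["perl", uroman_pl, "-l", lang],
        input=text, capture_output=True, text=True, check=True,
    )
    return result.stdout.strip()

print(uromanize("سلام این یک جمله به زبان فارسی است"))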
Hi! I am trying to train a TTS model for the Persian language. I followed the instructions you posted on the main page (README). However, I get this error and I can't fix it. In my opinion, these errors are related to the encoding format of the JSON files that are created in the "pytorch_dump_folder_path" folder. I tried to convert all of them to UTF-8 encoding, but because some of them contain only ASCII characters, they stayed in ASCII encoding. In my research, I kept coming across this sentence that said:
Of course, I'm not sure if the errors I'm getting are necessarily related to this issue or not, but anyway, I'd be happy if you could help.