coqui-ai / TTS

🐸💬 - a deep learning toolkit for Text-to-Speech, battle-tested in research and production
http://coqui.ai
Mozilla Public License 2.0

[Bug] VITS training fails with an error #1806

Closed A7Alzohaile closed 2 years ago

A7Alzohaile commented 2 years ago

Describe the bug

Hi, I want to fine-tune a single-speaker model to get a multi-speaker model. I followed the steps described in the docs, but I got a runtime error:

File "/home/osama/PycharmProjects/TTS/newenv/lib/python3.9/site-packages/torch/nn/functional.py", line 1852, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)
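
For context, torch.nn.Embedding indexes a lookup table, so the IDs fed to it must be an integer (Long) tensor; a float tensor of IDs triggers exactly this error. A minimal standalone sketch (variable names are illustrative, not from the TTS codebase):

import torch
import torch.nn as nn

# Embedding lookups require integer (torch.long) indices.
emb = nn.Embedding(num_embeddings=4, embedding_dim=8)

speaker_ids = torch.tensor([0.0, 2.0])  # FloatTensor
# emb(speaker_ids)                      # raises the RuntimeError quoted above

vectors = emb(speaker_ids.long())       # cast to Long -> OK, shape (2, 8)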

I've seen this issue before, but I couldn't find an intelligible answer.

Am I doing anything wrong? Please help me clarify things, and thank you in advance.

To Reproduce

Here is my train.py file:

import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig,CharactersConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# set experiment paths
output_path = os.path.dirname(os.path.abspath(__file__))
dataset_path = os.path.join(output_path, "../ljspeech/")

# download the dataset if not downloaded
# if not os.path.exists(dataset_path):
#     from TTS.utils.downloaders import download_vctk
#
#     download_vctk(dataset_path)

# define dataset config
dataset_config = BaseDatasetConfig(name="ljspeech", meta_file_train="metadata.txt", path="ljspeech/", language="ar")

# define audio config
# ❗ resample the dataset externally using `TTS/bin/resample.py` and set `resample=False` for faster training
audio_config = BaseAudioConfig(sample_rate=16000, resample=False, do_trim_silence=True, trim_db=23.0)

# define model config
config = GlowTTSConfig(
    batch_size=16,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    precompute_num_workers=4,
    run_eval=True,
    test_delay_epochs=-1,
    epochs=1000,
    #text_cleaner="phoneme_cleaners",
    use_phonemes=False,
    #phoneme_language="en-us",
    #phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    print_step=25,
    print_eval=False,
    mixed_precision=True,
    output_path=output_path,
    datasets=[dataset_config],
    characters=CharactersConfig(
        pad="_-ـ",
        eos="~",
        bos="^«»",
        characters="أإاﺎﺍاﺒبﺐﺏبتﺘﺖﺕتثﺜﺚﺙثجﺠﺞﺝجﺣﺤﺢﺡحﺧﺨﺦﺥخﺪﺬﺫذﺩدﺮﺭرﺰﺯزﺳﺴﺲﺱسﺷﺸﺵﺵص  \nی ک شﺻﺺﺹٰ ض   ئ ؤ\"ﺹ-ـ    ﺿﻀﺾﺽﻃﻄﻂﻁطﻇﻈﻆﻅظﻋﻌﻊﻉعﻏﻐ«» ﻎ   ﻍغ  ﻓﻔﻒﻑ    ف       ﻘﻗﻖﻕقﻛﻜﻚﻙك  ﻟﻠﻞﻝلﻣﻤﻢ    ﻡمﻧﻨ    ﻦﻥ  نﻫﻬﻪ    ﻩهيءﻮﻭوﻳﻴﻲﻱيﺂﺁآﺔﺓةﻰﻯىﺆﺅﺌﺋﺊﺉﺈﺇﻻﻼﻺﻹﻸﻷ ﻶﻵﺊﺉﺄﺃﺂﺁﺀﷲ  ً ٌ ٍ َ ُ ِ ّ ْ ٓ ٔ ٕ١٢٣٤٥٦٧٨٩٪",

        # characters="أإاﺎﺍاﺒبﺐﺏبتﺘﺖﺕتثﺜﺚﺙثجﺠﺞﺝجﺣﺤﺢﺡحﺧﺨﺦﺥخﺪﺬﺫذﺩدﺮﺭرﺰﺯزﺳﺴﺲﺱسﺷﺸﺵﺵصۘۛ☭ __چۚ“”ۖ…–۰ۘ—ۙ۰۹۱\nی ک شﺻﺺﺹٰ ض   ئ ؤ\"ﺹ-ـ    ﺿﻀﺾﺽﻃﻄﻂﻁطﻇﻈﻆﻅظﻋﻌﻊﻉعﻏﻐ«» ﻎ   ﻍغ  ﻓﻔﻒﻑ    ف       ﻘﻗﻖﻕقﻛﻜﻚﻙك  ﻟﻠﻞﻝلﻣﻤﻢ    ﻡمﻧﻨ    ﻦﻥ  نﻫﻬﻪ    ﻩهيءﻮﻭوﻳﻴﻲﻱيﺂﺁآﺔﺓةﻰﻯىﺆﺅﺌﺋﺊﺉﺈﺇﻻﻼﻺﻹﻸﻷ ﻶﻵﺊﺉﺄﺃﺂﺁﺀﷲ  ً ٌ ٍ َ ُ ِ ّ ْ ٓ ٔ ٕ١٢٣٤٥٦٧٨٩٪",
        punctuations="!)(٠.,?،:;؍؟؛‎"
    ),
    use_speaker_embedding=True,
    min_text_len=0,
    max_text_len=500,
    min_audio_len=0,
    max_audio_len=500000,
)

# INITIALIZE THE AUDIO PROCESSOR
# Audio processor is used for feature extraction and audio I/O.
# It mainly serves the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)
ap.sample_rate = 16000
# INITIALIZE THE TOKENIZER
# Tokenizer is used to convert text to sequences of token IDs.
# If characters are not defined in the config, default characters are passed to the config
tokenizer, config = TTSTokenizer.init_from_config(config)

# LOAD DATA SAMPLES
# Each sample is a list of ```[text, audio_file_path, speaker_name]```
# You can define your custom sample loader returning the list of samples.
# Or define your custom formatter and pass it to the `load_tts_samples`.
# Check `TTS.tts.datasets.load_tts_samples` for more details.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    eval_split_max_size=config.eval_split_max_size,
    eval_split_size=config.eval_split_size,
)

# init speaker manager for multi-speaker training
# it maps speaker-id to speaker-name in the model and data-loader
speaker_manager = SpeakerManager()
speaker_manager.set_ids_from_data(train_samples + eval_samples, parse_key="speaker_name")
config.num_speakers = speaker_manager.num_speakers

# init model
model = GlowTTS(config, ap, tokenizer, speaker_manager=speaker_manager)

# INITIALIZE THE TRAINER
# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,
# distributed training, etc.
trainer = Trainer(
    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples
)

# AND... 3,2,1... 🚀
trainer.fit()
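
A quick sanity check that could be appended to the script above, to confirm the data actually yields multiple speakers and that each sample carries a speaker name (a hedged sketch reusing the script's own variables):

# Hedged sanity check, reusing variables defined in the script above.
assert speaker_manager.num_speakers > 1, (
    f"use_speaker_embedding=True expects >1 speakers, got {speaker_manager.num_speakers}"
)
print("num_speakers:", speaker_manager.num_speakers)
print("first sample:", train_samples[0])  # should include a speaker_name field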

Command for fine-tuning:

CUDA_VISIBLE_DEVICES="0" python train.py --config_path pre_trained_model/arabic_models/config.json --restore_path pre_trained_model/arabic_models/best_model_4025258.pth --use_cuda True

Files generated after fine-tuning:

config.json file:

{
    "output_path": "/home/osama/PycharmProjects/TTS",
    "logger_uri": null,
    "run_name": "run",
    "project_name": null,
    "run_description": "\ud83d\udc38Coqui trainer run.",
    "print_step": 25,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "tensorboard",
    "log_model_step": 10000,
    "save_step": 10000,
    "save_n_checkpoints": 5,
    "save_checkpoints": true,
    "save_all_best": false,
    "save_best_after": 10000,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": -1,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": true,
    "epochs": 1000,
    "batch_size": 16,
    "eval_batch_size": 16,
    "grad_clip": 5.0,
    "scheduler_after_epoch": true,
    "lr": 0.001,
    "optimizer": "RAdam",
    "optimizer_params": {
        "betas": [
            0.9,
            0.998
        ],
        "weight_decay": 1e-06
    },
    "lr_scheduler": "NoamLR",
    "lr_scheduler_params": {
        "warmup_steps": 4000
    },
    "use_grad_scaler": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 54321,
    "model": "glow_tts",
    "num_loader_workers": 4,
    "num_eval_loader_workers": 4,
    "use_noise_augment": false,
    "audio": {
        "fft_size": 1024,
        "win_length": 1024,
        "hop_length": 256,
        "frame_shift_ms": null,
        "frame_length_ms": null,
        "stft_pad_mode": "reflect",
        "sample_rate": 22050,
        "resample": false,
        "preemphasis": 0.0,
        "ref_level_db": 20,
        "do_sound_norm": false,
        "log_func": "np.log10",
        "do_trim_silence": true,
        "trim_db": 45,
        "do_rms_norm": false,
        "db_level": null,
        "power": 1.5,
        "griffin_lim_iters": 60,
        "num_mels": 80,
        "mel_fmin": 0.0,
        "mel_fmax": null,
        "spec_gain": 20,
        "do_amp_to_db_linear": true,
        "do_amp_to_db_mel": true,
        "pitch_fmax": 640.0,
        "pitch_fmin": 0.0,
        "signal_norm": true,
        "min_level_db": -100,
        "symmetric_norm": true,
        "max_norm": 4.0,
        "clip_norm": true,
        "stats_path": null
    },
    "use_phonemes": false,
    "phonemizer": null,
    "phoneme_language": null,
    "compute_input_seq_cache": false,
    "text_cleaner": null,
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": null,
    "characters": {
        "characters_class": "TTS.tts.utils.text.characters.Graphemes",
        "vocab_dict": null,
        "pad": "_-\u0640",
        "eos": "~",
        "bos": "^\u00ab\u00bb",
        "blank": null,
        "characters": "\u0623\u0625\u0627\ufe8e\ufe8d\u0627\ufe92\u0628\ufe90\ufe8f\u0628\u062a\ufe98\ufe96\ufe95\u062a\u062b\ufe9c\ufe9a\ufe99\u062b\u062c\ufea0\ufe9e\ufe9d\u062c\ufea3\ufea4\ufea2\ufea1\u062d\ufea7\ufea8\ufea6\ufea5\u062e\ufeaa\ufeac\ufeab\u0630\ufea9\u062f\ufeae\ufead\u0631\ufeb0\ufeaf\u0632\ufeb3\ufeb4\ufeb2\ufeb1\u0633\ufeb7\ufeb8\ufeb5\ufeb5\u0635\t\n\u06cc \u06a9 \u0634\ufebb\ufeba\ufeb9\u0670 \u0636\t\u0626 \u0624\"\ufeb9-\u0640\t\ufebf\ufec0\ufebe\ufebd\ufec3\ufec4\ufec2\ufec1\u0637\ufec7\ufec8\ufec6\ufec5\u0638\ufecb\ufecc\ufeca\ufec9\u0639\ufecf\ufed0\u00ab\u00bb\t\ufece\t\ufecd\u063a\t\ufed3\ufed4\ufed2\ufed1\t\u0641\t\t\ufed8\ufed7\ufed6\ufed5\u0642\ufedb\ufedc\ufeda\ufed9\u0643\t\ufedf\ufee0\ufede\ufedd\u0644\ufee3\ufee4\ufee2\t\ufee1\u0645\ufee7\ufee8\t\ufee6\ufee5\t\u0646\ufeeb\ufeec\ufeea\t\ufee9\u0647\u064a\u0621\ufeee\ufeed\u0648\ufef3\ufef4\ufef2\ufef1\u064a\ufe82\ufe81\u0622\ufe94\ufe93\u0629\ufef0\ufeef\u0649\ufe86\ufe85\ufe8c\ufe8b\ufe8a\ufe89\ufe88\ufe87\ufefb\ufefc\ufefa\ufef9\ufef8\ufef7\t\ufef6\ufef5\ufe8a\ufe89\ufe84\ufe83\ufe82\ufe81\ufe80\ufdf2  \u064b \u064c \u064d \u064e \u064f \u0650 \u0651 \u0652 \u0653 \u0654 \u0655\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669\u066a",
        "punctuations": "!)(\u0660.,?\u060c:;\u060d\u061f\u061b\u200e",
        "phonemes": null,
        "is_unique": true,
        "is_sorted": true
    },
    "add_blank": false,
    "batch_group_size": 0,
    "loss_masking": null,
    "sort_by_audio_len": false,
    "min_audio_len": 0,
    "max_audio_len": 500000,
    "min_text_len": 0,
    "max_text_len": 500,
    "compute_f0": false,
    "compute_linear_spec": false,
    "precompute_num_workers": 4,
    "start_by_longest": false,
    "datasets": [
        {
            "name": "ljspeech",
            "path": "ljspeech/",
            "meta_file_train": "metadata.txt",
            "ignored_speakers": null,
            "language": "ar",
            "meta_file_val": "",
            "meta_file_attn_mask": ""
        }
    ],
    "test_sentences": [
        "It took me quite a long time to develop a voice, and now that I have it I'm not going to be silent.",
        "Be a voice, not an echo.",
        "I'm sorry Dave. I'm afraid I can't do that.",
        "This cake is great. It's so delicious and moist.",
        "Prior to November 22, 1963."
    ],
    "eval_split_max_size": null,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "num_chars": 205,
    "encoder_type": "rel_pos_transformer",
    "encoder_params": {
        "kernel_size": 3,
        "dropout_p": 0.1,
        "num_layers": 6,
        "num_heads": 2,
        "hidden_channels_ffn": 768,
        "input_length": null
    },
    "use_encoder_prenet": true,
    "hidden_channels_enc": 192,
    "hidden_channels_dec": 192,
    "hidden_channels_dp": 256,
    "dropout_p_dp": 0.1,
    "dropout_p_dec": 0.05,
    "mean_only": true,
    "out_channels": 80,
    "num_flow_blocks_dec": 12,
    "inference_noise_scale": 0.0,
    "kernel_size_dec": 5,
    "dilation_rate": 1,
    "num_block_layers": 4,
    "num_speakers": 1097,
    "c_in_channels": 0,
    "num_splits": 4,
    "num_squeeze": 2,
    "sigmoid_scale": false,
    "d_vector_dim": 0,
    "data_dep_init_steps": 10,
    "style_wav_for_test": null,
    "length_scale": 1.0,
    "use_speaker_embedding": true,
    "speakers_file": "/home/osama/PycharmProjects/TTS/speaker_multi_ar_fine/speakers.pth",
    "use_d_vector_file": false,
    "d_vector_file": false,
    "min_seq_len": 3,
    "max_seq_len": 500,
    "r": 1
}
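
Worth noting: the generated config reports "num_speakers": 1097, while the restored checkpoint came from a single-speaker model, so the speaker embedding shapes may not line up. A minimal inspection sketch (paths taken from the fine-tuning command above; the "model" key is the usual Coqui checkpoint layout, and filtering on "emb" to find the speaker embedding is an assumption that may differ across versions):

import torch

# Coqui checkpoints usually store the state dict under the "model" key.
ckpt = torch.load("pre_trained_model/arabic_models/best_model_4025258.pth", map_location="cpu")
for key, tensor in ckpt["model"].items():
    if "emb" in key.lower():
        print(key, tuple(tensor.shape))  # compare embedding rows vs. config num_speakers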

Command for testing the generated model:

tts --text "مرحبا كيف حالكم شكرا لكم على كل ما قدمتموه الى الأن" --config_path speaker_multi_ar_fine/config.json --model_path speaker_multi_ar_fine/best_model.pth --encoder_path pre_trained_model/tts_models/model_se.pth --encoder_config_path pre_trained_model/tts_models/config_se.json --speakers_file_path speaker_multi_ar_fine/speakers.pth --out_path output/ar/outar.wav --speaker_wav Tryeng.wav
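
As an aside: for a model trained with use_speaker_embedding=True (an internal embedding table rather than d-vectors), the speaker is usually selected by ID rather than by a reference wav. A hedged example, assuming the stock tts CLI flags:

tts --model_path speaker_multi_ar_fine/best_model.pth --config_path speaker_multi_ar_fine/config.json --list_speaker_idxs

and then pass one of the printed names via --speaker_idx.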

Expected behavior

Successful voice conversion.

Logs

No response

Environment

{
    "CUDA": {
        "GPU": [],
        "available": false,
        "version": null
    },
    "Packages": {
        "PyTorch_debug": false,
        "PyTorch_version": "1.7.1+cpu",
        "TTS": "0.7.1",
        "numpy": "1.21.6"
    },
    "System": {
        "OS": "Linux",
        "architecture": [
            "64bit",
            "ELF"
        ],
        "processor": "x86_64",
        "python": "3.9.13",
        "version": "#44-Ubuntu SMP Wed Jun 22 14:20:53 UTC 2022"
    }
}

Additional context

I should point out that I used a pre-trained speaker encoder, not the one generated by the model.

here is my trainer log file as well: trainer_0_log.txt

erogol commented 2 years ago

I can't help you if I can't reproduce the error. You can set up a Colab notebook where we can reproduce the issue.

erogol commented 2 years ago

I also see you double-posted on discussions. Please don't do that.

A7Alzohaile commented 2 years ago

I also see you double-posted on discussions. Please don't do that.

Sorry for doing that.

stale[bot] commented 2 years ago

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look at our discussion channels.