Closed Naozumi520 closed 10 months ago
I checked the code and speaker_embedding
is defined after it, the heck?
What caused the error? Can you share the code or command?
Here's my code:
import os
# Trainer: Where the ✨️ happens.
# TrainingArgs: Defines the set of arguments of the Trainer.
from trainer import Trainer, TrainerArgs
# GlowTTSConfig: all model related values for training, validating and testing.
from TTS.tts.configs.glow_tts_config import GlowTTSConfig
# BaseDatasetConfig: defines name, formatter and path of the dataset.
from TTS.config.shared_configs import BaseAudioConfig
from TTS.tts.configs.shared_configs import BaseDatasetConfig, CapacitronVAEConfig
from TTS.tts.datasets import load_tts_samples, common_voice
from TTS.tts.models.glow_tts import GlowTTS
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor
# Training output is written to the directory that contains this script.
output_path = os.path.dirname(os.path.abspath(__file__))
# Dataset root; this is a relative path, so it is resolved against the
# current working directory rather than against `output_path`.
data_path = "zh-HK/"

# DATASET CONFIG
# The files under `data_path` are read with the "common_voice" formatter,
# using "train.tsv" as the training metadata file.
dataset_config = BaseDatasetConfig(
    path=data_path,
    meta_file_train="train.tsv",
    formatter="common_voice",
    # language="zh-HK",
)
# AUDIO CONFIG
# Settings for loading audio and extracting features. `sample_rate` is not
# set here, so the library default applies.
audio_config = BaseAudioConfig(
    # sample_rate=16000,
    resample=True,
    # Silence trimming.
    do_trim_silence=True,
    trim_db=60.0,
    # Normalization and spectrogram scaling.
    signal_norm=False,
    spec_gain=1.0,
    log_func="np.log",
    ref_level_db=20,
    preemphasis=0.0,
    # Mel filterbank frequency range.
    # NOTE(review): 11025 presumably corresponds to half of a 22050 Hz
    # sample rate - confirm against the effective `sample_rate`.
    mel_fmin=0.0,
    mel_fmax=11025,
)
# INITIALIZE THE TRAINING CONFIGURATION
# Configure the model. Every config class inherits the BaseTTSConfig.
config = GlowTTSConfig(
    run_name="naozumiCantonese",
    audio=audio_config,
    # Batching / clip-length limits.
    batch_size=128,  # Tune this to your gpu
    # Lengths are expressed as seconds * 22050.
    # NOTE(review): assumes a 22050 Hz sample rate - confirm against `audio_config`.
    max_audio_len=8 * 22050,  # Tune this to your gpu
    min_audio_len=1 * 22050,
    eval_batch_size=16,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # Evaluation is switched off for this run.
    run_eval=False,
    test_delay_epochs=-1,
    epochs=1000,
    # Text front-end: phonemize with espeak, caching phonemes next to the script.
    text_cleaner="phoneme_cleaners",
    use_phonemes=True,
    phoneme_language="yue",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    phonemizer="espeak",
    # Logging.
    print_step=25,
    print_eval=False,
    mixed_precision=False,
    # Learning-rate schedule.
    lr=1e-3,
    lr_scheduler="StepwiseGradualLR",
    lr_scheduler_params={
        # [step threshold, learning rate] pairs, listed with ascending
        # thresholds and descending rates. The third threshold is 4e4 so the
        # list stays ordered between 2e4 and 6e4; with 4e5 the 3e-4 stage
        # would sit out of sequence.
        # NOTE(review): presumably the scheduler applies the rate of the
        # last threshold already reached - verify against StepwiseGradualLR.
        "gradual_learning_rates": [
            [0, 1e-3],
            [2e4, 5e-4],
            [4e4, 3e-4],
            [6e4, 1e-4],
            [8e4, 5e-5],
        ]
    },
    scheduler_after_epoch=False,  # scheduler doesn't work without this flag
    # Need to experiment with these below for capacitron
    # NOTE(review): the note above mentions capacitron, but this script
    # trains GlowTTS - confirm `loss_masking=False` is still wanted.
    loss_masking=False,
    output_path=output_path,
    # The single dataset defined by `dataset_config`.
    datasets=[dataset_config],
)
# AUDIO PROCESSOR
# Built from the training config; the same instance is passed to the model
# and handed to the trainer as a training asset below.
ap = AudioProcessor.init_from_config(config)

# TOKENIZER
# `init_from_config` returns the tokenizer together with a config object,
# which replaces `config` from this point on.
tokenizer, config = TTSTokenizer.init_from_config(config)

# DATA SAMPLES
# Samples are loaded through the `common_voice` formatter, and an eval
# split is requested alongside the training samples.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    formatter=common_voice,
    eval_split=True,
)

# MODEL
# GlowTTS receives the config, the audio processor and the tokenizer.
# No speaker manager is supplied.
model = GlowTTS(config, ap, tokenizer, speaker_manager=None)

# TRAINER
# Default `TrainerArgs()` are used; outputs go to `output_path`.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
    training_assets={"audio_processor": ap},
)

# Start training.
trainer.fit()
This error happened during training, and also during inference.
The error says Tacotron, but your code is GlowTTS. I'm confused.
Yes I'm confused also D:
This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions. You might also look at our discussion channels.
Describe the bug
To Reproduce
Train a tacotron2 model and run it
Expected behavior
No response
Logs
No response
Environment
Additional context
No response