Open smallsudarshan opened 5 months ago
OK, so here you go. I picked up the training code from this repo.
`train.py`:
import torch
import wandb
from models.dvae import DiscreteVAE
from utils.arch_utils import TorchMelSpectrogram
from torch.utils.data import DataLoader
from utils.dvae_dataset import DVAEDataset
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
import pdb
from TTS.tts.datasets import load_tts_samples
from TTS.config.shared_configs import BaseDatasetConfig
# ---------------------------------------------------------------------------
# Paths to the pretrained XTTS v2 artefacts (DVAE weights, mel statistics).
# ---------------------------------------------------------------------------
dvae_checkpoint = '/home/ubuntu/test_tts/SimpleTTS/xtts/run/training/XTTS_v2.0_original_model_files/dvae.pth'
mel_norm_file = '/home/ubuntu/test_tts/SimpleTTS/xtts/run/training/XTTS_v2.0_original_model_files/mel_stats.pth'

# ---------------------------------------------------------------------------
# Hyper-parameters.
# ---------------------------------------------------------------------------
SAMPLE_RATE = 22050
GRAD_CLIP_NORM = 0.5
LEARNING_RATE = 5e-05
epochs = 20

# ---------------------------------------------------------------------------
# Dataset description (LJSpeech-style metadata).
# ---------------------------------------------------------------------------
config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="/home/ubuntu/test_tts/sapien-formatted-english-22050",
    meta_file_train="/home/ubuntu/test_tts/sapien-formatted-english-22050/metadata_norm.txt",
    language="en",
)
# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# ---------------------------------------------------------------------------
# Model, optimizer and mel-spectrogram front end.
# ---------------------------------------------------------------------------
dvae = DiscreteVAE(
    channels=80,
    normalization=None,
    positional_dims=1,
    num_tokens=1024,
    codebook_dim=512,
    hidden_dim=512,
    num_resnet_blocks=3,
    kernel_size=3,
    num_layers=2,
    use_transposed_convs=False,
)
# strict=False: keys missing from / unexpected in the checkpoint are tolerated.
dvae.load_state_dict(torch.load(dvae_checkpoint), strict=False)
dvae.cuda()

opt = Adam(dvae.parameters(), lr=LEARNING_RATE)

torch_mel_spectrogram_dvae = TorchMelSpectrogram(
    mel_norm_file=mel_norm_file,
    sampling_rate=SAMPLE_RATE,
).cuda()

# ---------------------------------------------------------------------------
# Data: split the samples, wrap them in datasets, then in loaders.
# ---------------------------------------------------------------------------
train_samples, eval_samples = load_tts_samples(
    DATASETS_CONFIG_LIST,
    eval_split=True,
    eval_split_max_size=256,
    eval_split_size=0.01,
)

# The third positional argument of DVAEDataset is `is_eval`.
eval_dataset = DVAEDataset(eval_samples, SAMPLE_RATE, True)
train_dataset = DVAEDataset(train_samples, SAMPLE_RATE, False)

# Settings shared by both loaders; only the dataset, its collate function
# and the worker count differ.
_loader_common = dict(
    batch_size=3,
    shuffle=False,
    drop_last=False,
    pin_memory=False,
)
eval_data_loader = DataLoader(
    eval_dataset,
    collate_fn=eval_dataset.collate_fn,
    num_workers=0,
    **_loader_common,
)
train_data_loader = DataLoader(
    train_dataset,
    collate_fn=train_dataset.collate_fn,
    num_workers=4,
    **_loader_common,
)

# ---------------------------------------------------------------------------
# Switch to training mode and start experiment tracking.
# ---------------------------------------------------------------------------
torch.set_grad_enabled(True)
dvae.train()

wandb.init(project='train_dvae')
wandb.watch(dvae)
def to_cuda(x: torch.Tensor) -> torch.Tensor:
    """Return ``x`` made contiguous and, when CUDA is available, moved to the GPU.

    Args:
        x: Usually a tensor. ``None`` and non-tensor values (for example a
            list of file names inside a batch) are accepted as well.

    Returns:
        ``None`` if ``x`` is ``None``; a contiguous tensor (on the current
        CUDA device when one is available) if ``x`` is a tensor; otherwise
        ``x`` unchanged.
    """
    if x is None:
        return None
    if not torch.is_tensor(x):
        # Non-tensor payloads are passed through untouched.
        return x
    x = x.contiguous()
    if torch.cuda.is_available():
        # non_blocking only overlaps the copy when the source is pinned memory.
        x = x.cuda(non_blocking=True)
    return x
@torch.no_grad()
def format_batch(batch):
    """Move a collated batch to the GPU and attach its mel spectrogram.

    Args:
        batch: A dict (values are sent through ``to_cuda`` in place) or a
            list (a new list of converted items is built).

    Returns:
        The batch with an added ``'mel'`` entry computed from
        ``batch['wav']``. The last axis of the mel is trimmed to a multiple
        of 4. If the spectrogram module raises ``NotImplementedError`` the
        batch is returned without a ``'mel'`` entry.
    """
    if isinstance(batch, dict):
        # Only existing keys are reassigned, so iterating items() is safe.
        for key, value in batch.items():
            batch[key] = to_cuda(value)
    elif isinstance(batch, list):
        batch = [to_cuda(value) for value in batch]

    try:
        mel = torch_mel_spectrogram_dvae(batch['wav'])
    except NotImplementedError:
        # Best-effort: leave the batch without 'mel' rather than failing here.
        return batch

    # if the mel spectogram is not divisible by 4 then input.shape != output.shape
    # for dvae
    remainder = mel.shape[-1] % 4
    if remainder:
        mel = mel[:, :, :-remainder]
    batch['mel'] = mel
    return batch
# Main optimisation loop: one optimizer step per batch, metrics sent to
# stdout and to Weights & Biases after every step.
for i in range(epochs):
    for cur_step, batch in enumerate(train_data_loader):
        opt.zero_grad()
        batch = format_batch(batch)

        recon_loss, commitment_loss, _ = dvae(batch['mel'])
        # NOTE(review): the thread mentions the model computing the
        # reconstruction loss with reduction="none". backward() needs a
        # scalar, so reduce here; .mean() is a no-op if it is already scalar.
        recon_loss = recon_loss.mean()
        total_loss = recon_loss + commitment_loss

        total_loss.backward()
        clip_grad_norm_(dvae.parameters(), GRAD_CLIP_NORM)
        opt.step()

        # Read each scalar back from the device once and reuse it.
        total_value = total_loss.item()
        recon_value = recon_loss.item()
        commit_value = commitment_loss.item()

        log = {
            'epoch': i,
            'cur_step': cur_step,
            'loss': total_value,
            'recon_loss': recon_value,
            'commit_loss': commit_value,
        }
        # The step used to be wrapped in a nested print(), which printed it on
        # a separate line and left a literal "None" in this line.
        print(
            f"epoch: {i}",
            f"step: {cur_step}",
            f'loss - {total_value}',
            f'recon_loss - {recon_value}',
            f'commit_loss - {commit_value}',
        )
        wandb.log(log)
        torch.cuda.empty_cache()
    # TODO: checkpointing is still disabled; `save_model` is not defined in
    # this script.
    # if i%10:
    # save_model(f'.dvae.pth')
    # wandb.save('./dvae.pth')
# wandb.finish()
I wrote a custom `DVAEDataset` that is imported in the `train.py` file above.
import torch
import random
from utils.dataset import key_samples_by_col
from TTS.tts.models.xtts import load_audio
# Restrict PyTorch to a single intra-op CPU thread in this process.
# NOTE(review): presumably to avoid CPU oversubscription when several
# DataLoader workers import this module - confirm.
torch.set_num_threads(1)
class DVAEDataset(torch.utils.data.Dataset):
    """Audio-only dataset used to train the DVAE.

    In training mode (``is_eval=False``) the samples are shuffled with a
    fixed seed and grouped by language; every ``__getitem__`` call ignores
    the requested index and draws a random language and a random sample.
    In evaluation mode the samples are filtered once at construction time
    and then served by index.

    Each item is a dict with keys ``"wav"``, ``"wav_lengths"`` and
    ``"filenames"``.
    """

    # Upper bound on random re-draws in training mode before giving up.
    _MAX_LOAD_ATTEMPTS = 1000

    def __init__(self, samples, sample_rate, is_eval):
        """Store the configuration and prepare ``samples`` for the chosen mode.

        Args:
            samples: List of sample dicts; each needs an ``"audio_file"``
                entry. In training mode the list is shuffled in place.
            sample_rate: Sample rate passed to ``load_audio``.
            is_eval: ``True`` for index-based evaluation access, ``False``
                for random training access.
        """
        self.sample_rate = sample_rate
        self.is_eval = is_eval
        # Clips longer than this many audio samples are rejected.
        self.max_wav_len = 255995
        self.samples = samples
        self.training_seed = 1
        # Ids of samples that already failed to load or were too long.
        self.failed_samples = set()
        if not is_eval:
            random.seed(self.training_seed)
            random.shuffle(self.samples)
            # order by language
            self.samples = key_samples_by_col(self.samples, "language")
            print(" > Sampling by language:", self.samples.keys())
        else:
            # for evaluation load and check samples that are corrupted to ensures the reproducibility
            self.check_eval_samples()

    def _is_unusable(self, wav):
        """Return True when ``wav`` is missing or longer than ``max_wav_len``."""
        if wav is None:
            return True
        return self.max_wav_len is not None and wav.shape[-1] > self.max_wav_len

    def check_eval_samples(self):
        """Drop every eval sample that cannot be loaded or is too long."""
        print(" > Filtering invalid eval samples!!")
        new_samples = []
        for sample in self.samples:
            try:
                _, wav = self.load_item(sample)
            except Exception:
                # Unreadable or ultra-short clip: skip it.
                continue
            # Basically, this audio file is nonexistent or too long to be supported by the dataset.
            if self._is_unusable(wav):
                continue
            new_samples.append(sample)
        self.samples = new_samples
        print(" > Total eval samples after filtering:", len(self.samples))

    def load_item(self, sample):
        """Load the audio of one sample.

        Returns:
            ``(audiopath, wav)``.

        Raises:
            ValueError: If nothing was loaded or the clip is shorter than
                half a second.
        """
        audiopath = sample["audio_file"]
        wav = load_audio(audiopath, self.sample_rate)
        if wav is None or wav.shape[-1] < (0.5 * self.sample_rate):
            # Ultra short clips are also useless (and can cause problems within some models).
            raise ValueError
        return audiopath, wav

    def _load_or_mark_failed(self, sample, sample_id):
        """Return the item dict for ``sample``, or None after recording a failure."""
        # ignore samples that we already know that is not valid ones
        if sample_id in self.failed_samples:
            return None
        # try to load the sample, if fails added it to the failed samples list
        try:
            audiopath, wav = self.load_item(sample)
        except Exception:
            self.failed_samples.add(sample_id)
            return None
        # Basically, this audio file is nonexistent or too long to be supported by the dataset.
        if self._is_unusable(wav):
            self.failed_samples.add(sample_id)
            return None
        return {
            "wav": wav,
            "wav_lengths": torch.tensor(wav.shape[-1], dtype=torch.long),
            "filenames": audiopath,
        }

    def __getitem__(self, index):
        """Return one item dict.

        Training mode ignores ``index`` and draws random samples until one
        loads. Evaluation mode returns the sample at ``index``; if that one
        is invalid it falls forward to the next valid sample (wrapping
        around) instead of retrying a fixed index, which could recurse
        without end.

        Raises:
            IndexError: In evaluation mode, if ``index`` is out of range.
            RuntimeError: If no valid sample could be found.
        """
        if self.is_eval:
            # Plain indexing first so out-of-range access still raises IndexError.
            self.samples[index]
            total = len(self.samples)
            start = index % total
            for offset in range(total):
                position = (start + offset) % total
                item = self._load_or_mark_failed(self.samples[position], str(position))
                if item is not None:
                    return item
            raise RuntimeError("DVAEDataset: no valid evaluation sample could be loaded")

        for _ in range(self._MAX_LOAD_ATTEMPTS):
            # select a random language
            lang = random.choice(list(self.samples.keys()))
            # select random sample
            position = random.randint(0, len(self.samples[lang]) - 1)
            # a unique id for each sample to deal with fails
            sample_id = lang + "_" + str(position)
            item = self._load_or_mark_failed(self.samples[lang][position], sample_id)
            if item is not None:
                return item
        raise RuntimeError("DVAEDataset: no valid training sample could be loaded")

    def __len__(self):
        """Return the number of samples (summed over languages in training mode)."""
        if self.is_eval:
            return len(self.samples)
        return sum(len(group) for group in self.samples.values())

    def collate_fn(self, batch):
        """Merge a list of item dicts into one zero-padded batch dict.

        Returns:
            A dict where ``"wav"`` is a float32 tensor of shape
            ``(B, 1, max_len)``, ``"wav_lengths"`` is a stacked tensor of the
            individual lengths and every other key holds a list of the
            per-item values.
        """
        # convert list of dicts to dict of lists
        batch_size = len(batch)
        batch = {key: [item[key] for item in batch] for key in batch[0]}

        # stack for features that already have the same shape
        lengths = torch.stack(batch["wav_lengths"])
        batch["wav_lengths"] = lengths
        longest = int(lengths.max())

        # zero-initialised padding tensor
        wav_padded = torch.zeros(batch_size, 1, longest, dtype=torch.float32)
        for row, wav in enumerate(batch["wav"]):
            wav_padded[row, :, : int(lengths[row])] = torch.as_tensor(wav, dtype=torch.float32)

        batch["wav"] = wav_padded
        return batch
This trains the DVAE to encode and decode mel-spectrograms.
A few things:
- There is a `DiscretizationLoss` here, but I am not sure where or how it is used, so I am not using it currently.
- In `dvae.py`, on line 378, the author has added `self.loss_fn(img, out, reduction="none")`. I am not sure what the purpose of `reduction='none'` is, so I have summed it up in my code and just added it to calculate the loss.
The next step would be to fine-tune on a larger dataset. @erogol @eginhard if this is in the right direction, I can convert this into a training recipe and add it to the repo.
PS: The code is a bit dirty since I have just re-used whatever was available as long as it doesn't harm my training.
I also now understand that the decoder of DVAE is not used, but instead an LM head is used on the GPT-2 to recompute the mel from the audio-codes. Need to understand this a bit better before writing the next stage training code.
Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ?
dvae
May I ask a question haha, to train the dvae model, is it only necessary to use the features of the audio file? Text is not needed?
dvae
May I ask a question haha, to train the dvae model, is it only necessary to use the features of the audio file? Text is not needed?
yes.
Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ?
Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets.
In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:
We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.
Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ?
Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets.
In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:
- Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.
- Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.
We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.
I must say, you are very meticulous, kudos to you! Hasn't coqui-ai shut down? With no one maintaining it, I'm currently putting into practice the inspiration you provided. With a large amount of data, it still has a significant effect; moreover, training the dvae is just the first phase. After training is complete, we use this new dvae model to continue to the second phase: training the GPT model, followed by the third phase of training Hifi. I think that in the absence of peer review, we could team up to put this into practice, then report on progress and any issues that may arise, and work together to solve them. I'm not sure if you have WeChat (or any other social media), but I've started some discussion groups to explore each other's practical experiences and to pioneer together.
Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ?
Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets. In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:
- Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.
- Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.
We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.
I must say, you are very meticulous, kudos to you! Hasn't coqui-ai shut down? With no one maintaining it, I'm currently putting into practice the inspiration you provided. With a large amount of data, it still has a significant effect; moreover, training the dvae is just the first phase. After training is complete, we use this new dvae model to continue to the second phase: training the GPT model, followed by the third phase of training Hifi. I think that in the absence of peer review, we could team up to put this into practice, then report on progress and any issues that may arise, and work together to solve them. I'm not sure if you have WeChat (or any other social media), but I've started some discussion groups to explore each other's practical experiences and to pioneer together.
my wechat: pineking, we can discuss the training questions.
Awesome! Amazing! Did you implement the stage 'Finally fine-tune end to end with the Hi-Fi GAN' ?
Hey @ScottishFold007 unfortunately no, we have been experimenting with fine-tuning just the GPT2 model with larger and much more accurately annotated custom datasets. In case you are facing quality issues, my suggestion would be to focus a lot on the dataset, it really helped us drastically improve quality. Particularly:
- Consistent and clear pronunciations - eg. if randomly sometimes your word has high pitch, high speed and sometimes low, then it will not work -> but however, if there is a pattern, for eg. words before a comma are slow and words after comma are fast, it will pick it up.
- Commas match pauses - if you don't have commas where you have pauses, then it will randomly pause/generate gibberish when it sees a comma etc. If you don't have commas, your speech will seem monotonous/bland.
We are yet to pick up training for the other stages, it's in my to-do list. I just deprioritized it a bit since I did not get any response either from the repo owners or someone who has previously contributed to this. And I did not want to build something that might mislead people by implementing the wrong thing without peer review.
I must say, you are very meticulous, kudos to you! Hasn't coqui-ai shut down? With no one maintaining it, I'm currently putting into practice the inspiration you provided. With a large amount of data, it still has a significant effect; moreover, training the dvae is just the first phase. After training is complete, we use this new dvae model to continue to the second phase: training the GPT model, followed by the third phase of training Hifi. I think that in the absence of peer review, we could team up to put this into practice, then report on progress and any issues that may arise, and work together to solve them. I'm not sure if you have WeChat (or any other social media), but I've started some discussion groups to explore each other's practical experiences and to pioneer together.
my wechat: pineking, we can discuss the training questions.
好的,加你了
@ScottishFold007 @pineking unfortunately I don't use wechat. Maybe we can connect on discord?
There is this repository https://github.com/idiap/coqui-ai-TTS -> where they are maintaining a new pip package for TTS. I had asked the author if they would consider merging something like this, and he said he would, if we are able to replicate the TTS model from scratch.
Also, currently I have 2-3 projects running, so not sure if I will move on this with speed, but happy to connect and contribute in any way I can every now and then.
@smallsudarshan Hi, thank you for the code, I put everything in one place and made it easier for someone who will want to do a DVAE finetune, https://github.com/daswer123/xtts-finetune-tests/tree/main/dvae-finetune
@daswer123 thanks a lot for picking up the baton!
Few things I have observed:
One of the ways to make the model more robust in this to change the training recipe a bit. Currently the ljspeech data loader completely ignores speaker information.
During training, the same sample is giving to the perceiver that needs to be synthesized. What if instead, we keep the speaker (and if applicable other characteristics like emotion) the same but use a sample with different spoken content?
That way, the model might learn that it is the style from the speaker that has to be picked and it might also work a bit better for out-of-distribution (not sure though).
If this has to truly work, it needs to have explicit separate vectors maybe that represent emotion and speaker info?
Point 2 is a bit of a deviation from the XTTS architecture, but point 1 seems simple to implement.
@ScottishFold007 @pineking
@smallsudarshan @daswer123 If you are looking for HiFi-GAN XTTS training code, you can check out this: https://github.com/tuanh123789/Train_Hifigan_XTTS
@tuanh123789 Wow, thanks, it turns out we have the ability to fine-tune each component for XTTS.
and can you tell me approximately how fine-tuning will affect the result, can we train on multiple speakers?
And how do you think pipelines when we train one voice through all stages: DVAE -> GPT-2 -> HifiGAN , this should give a much better result than fine tuning GPT-2
I experiment with Ljspeech dataset both finetune and train from scratch and output very promising. With vietnamese I use 80h. Sure we can train on multi speakers
One problem with finetune GPT part. The short text audio output is very bad, do you solve it @daswer123
@tuanh123789 Yeah, I noticed that, too. Unfortunately, I haven't found a solution yet.
@smallsudarshan @daswer123 you never have to train a DVAE. For fine-tuning, only tune GPT-2 plus HiFi-GAN. For fine-tuning on larger datasets, the DVAE works for every language; you can even use a pretrained Tortoise DVAE.
for a shorter text it's data problem , add enough short sentences and it'll work.
for a shorter text it's data problem , add enough short sentences and it'll work.
Thanks for the response. After fine-tuning the GPT part with normal data, I used an extra corpus of about 11h of short text-audio pairs to fine-tune one more time, but the results did not improve.
@manmay-nakhashi I'm not really familiar with all the processes and maybe I don't understand something, but why is fine tuning DVAE and then passing it to GPT-2 not necessary, wouldn't pre-training DVAE on the training dataset give GPT-2 a better view of the dataset?
@daswer123 dvae is universal, can adapt to any language , it just learns how to compress a spectrogram.
It's true. I implemented a DVAE training pipeline for Vietnamese, but the results are about the same as when using the DVAE pretrained on other languages. The short-text issue after fine-tuning GPT is still the problem, though.
@tuanh123789 it's a data problem add lot's of single word and short sentences.
Yeah, Let's try
I also now understand that the decoder of DVAE is not used, but instead an LM head is used on the GPT-2 to recompute the mel from the audio-codes. Need to understand this a bit better before writing the next stage training code.
@manmay-nakhashi @tuanh123789 is the dvae even being used? I had checked it sometime back and I don't think it was being used.
And yes, short text is just a simple data problem.
One more problem - I have also seen short audio spikes at the end of speech, not sure how to solve it, but can probably be post-processed.
for a shorter text it's data problem , add enough short sentences and it'll work.
thanks for response. After finetune gpt part with normal data, I use extra corpus about 11h of short text-audio to finetune one more time. But the results is not improve
@tuanh123789 did you try mix training? Sequential had not given great results for us.
I also now understand that the decoder of DVAE is not used, but instead an LM head is used on the GPT-2 to recompute the mel from the audio-codes. Need to understand this a bit better before writing the next stage training code.
@manmay-nakhashi @tuanh123789 is the dvae even being used? I had checked it sometime back and I don't think it was being used.
And yes, short text is just a simple data problem.
One more problem - I have also seen short audio spikes at the end of speech, not sure how to solve it, but can probably be post-processed.
Dvae use to create audio token to train gpt part
I also now understand that the decoder of DVAE is not used, but instead an LM head is used on the GPT-2 to recompute the mel from the audio-codes. Need to understand this a bit better before writing the next stage training code.
@manmay-nakhashi @tuanh123789 is the dvae even being used? I had checked it sometime back and I don't think it was being used.
And yes, short text is just a simple data problem.
One more problem - I have also seen short audio spikes at the end of speech, not sure how to solve it, but can probably be post-processed.
Do you solve the short text problem after finetune?
@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here.
Can you share the part in the code where the DVAE is being imported and used?
@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here.
Can you share the part in the code where the DVAE is being imported and used?
Hello, did you train GPT part of xtts using multi-gpu? for some reason DDP in coqui's codes doesn't work properly, did you face the same issue?
@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here. Can you share the part in the code where the DVAE is being imported and used?
Hello, did you train GPT part of xtts using multi-gpu? for some reason DDP in coqui's codes doesn't work properly, did you face the same issue?
Yes i use 8 GPU to train GPT part and succesful. What problem do you get?
@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here. Can you share the part in the code where the DVAE is being imported and used?
Hello, did you train GPT part of xtts using multi-gpu? for some reason DDP in coqui's codes doesn't work properly, did you face the same issue?
Yes i use 8 GPU to train GPT part and succesful. What problem do you get?
Do you use num_workers> 0 in dataloader?
I get these GPU load graphs with DDP (gpu0 purple, gpu1 green - all the other GPUs behave the same), then training hangs and gets stuck.
With one GPU and num_workers > 0 things go the same way
it only works with 1 GPU and num_workers=0 in my case
It's probably not a hardware problem, tortoise TTS and some other DDP tunings go well, only coqui's Trainer has those problems
@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here. Can you share the part in the code where the DVAE is being imported and used?
Hello, did you train GPT part of xtts using multi-gpu? for some reason DDP in coqui's codes doesn't work properly, did you face the same issue?
Yes i use 8 GPU to train GPT part and succesful. What problem do you get?
Do you use num_workers> 0 in dataloader?
I get those gpu load graphs with DDP (gpu0 purple, gpu1 green - all the rest GPUs behave the same) than training hangs, gets stuck
With one GPU and num_workers > 0 things go the same way
it only works with 1 GPU and num_workers=0 in my case
It's probably not a hardware problem, tortoise TTS and some other DDP tunings go well, only coqui's Trainer has those problems
Yes I set num_woker > 0. What hardware do you use?
Yes I set num_woker > 0. What hardware do you use?
x6 RTX a6000 48GB, 512GB RAM, 128 amd cores, nvme fast ssds
Yes I set num_woker > 0. What hardware do you use?
did you use standart coqui/TTS code to train? or some kind of fork
I use code provide by coqui
I use code provide by coqui
Can you please tell me, do you use the same command? python -m trainer.distribute --script recipes/ljspeech/xtts_v2/train_gpt_xtts.py --gpus 0,1,2,3,4,5
maybe it's the problem
@tuanh123789 it certainly gets better when you try mix training (small and large sentences together), use a large set of single words. We are still testing this. Will post here.
Can you share the part in the code where the DVAE is being imported and used?
Can you provide sentences length ratio in training dataset. You said that adding single words during training. But in the code there is a section that removes audio segments < 0.5s
you can reduce that to 0.3 may be if you want to just add hi, hello etc.
you can reduce that to 0.3 may be if you want to just add hi, hello etc.
Thank you 🤗
Hi @tuanh123789, have you overcome the short text error yet?
Yes. Add more short sentences. And config min_condition_length in train smaller
My thanks
Yes. Add more short sentences. And config min_condition_length in train smaller
As I understand it, min_condition_length is only related to the reference audio. So how does it address the short text problem?
Yes add more short audio and this config will solve the problem
Yes add more short audio and this config will solve the problem
Can you provide me with more information about the number of hours of short audio and the specific min_condition_length value to achieve good results?
Yes add more short audio and this config will solve the problem
Can you provide me with more information about the number of hours of short audio and the specific min_condition_length value to achieve good results?
Finetune Dvae with your data :D
@tuanh123789 hi, can you share what changes did you make to the training code to enable fine-tuning on Vietnamese data?
🚀 Feature Description Hey, we saw that there is no training code for fine-tuning all parts of XTTS V2. We would like to contribute if it adds value.
The aim can be to make it work very reliably on a particular accent [Indian for eg.], in a particular language[English], in a particular speaking style with very little variability. We tried simply fine-tuning and it seems like it learns the accent somewhat and the speaking style, but is not super robust and mispronounces quite a lot.
Solution
We are not sure if the perceiver needs any fine-tuning.
If licenses permit, we will also share the data.
Does this make sense?