TensorSpeech / TensorFlowTTS

TensorFlowTTS: Real-time state-of-the-art speech synthesis for TensorFlow 2 (supports English, French, Korean, Chinese, and German, and is easy to adapt to other languages)
https://tensorspeech.github.io/TensorFlowTTS/
Apache License 2.0

suggestion to train model #414

Closed: lalimili6 closed this issue 3 years ago

lalimili6 commented 3 years ago

Hi dears, thanks for this project. I am training a TTS model on my own dataset for the Farsi language. I segmented an audiobook from a single speaker and built the dataset in the LJSpeech format, using the LJSpeech preprocessing with custom symbols (#350). The dataset contains 25 hours of audio across 12k wave files, with an average clip length of 5.90 seconds. I trained a Tacotron-2 model following the example configuration and used the Multiband-MelGAN vocoder from the English LJSpeech pretrained model. I train on a 1030 (2 GB) GPU, so I set the batch size to 2 because larger batches crash the GPU memory. I have trained for 68k steps so far, but the decoded waves are not good and not understandable; my alignment pictures are below.

Would you mind suggesting how to improve the output waves? For example: continue training, train my own vocoder instead of the LJSpeech one, revise my dataset, add new voices, or use another model like FastSpeech2 (I have also trained a Kaldi model for alignment). Are there any details about the dataset or outputs that I can share to help?
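For reference, this is roughly how the LJSpeech-style layout is assembled from the segmented audiobook clips; the paths and the transcripts mapping below are illustrative placeholders, not the exact script I used:

import os

# Illustrative sketch of an LJSpeech-style dataset layout:
#   my_farsi_dataset/
#     wavs/<clip_id>.wav            (the segmented audiobook clips)
#     metadata.csv                  (clip_id|transcript|normalized_transcript)
# `dataset_root` and `transcripts` are placeholders for the real segmentation output.
dataset_root = "my_farsi_dataset"
wav_dir = os.path.join(dataset_root, "wavs")

transcripts = {
    # "clip_id": "Farsi transcript of that clip",
}

with open(os.path.join(dataset_root, "metadata.csv"), "w", encoding="utf-8") as f:
    for wav_name in sorted(os.listdir(wav_dir)):
        if not wav_name.endswith(".wav"):
            continue
        clip_id = os.path.splitext(wav_name)[0]
        text = transcripts.get(clip_id, "")
        # LJSpeech stores raw and normalized text; the same string is reused here
        # if normalization is handled elsewhere (e.g. in the processor).
        f.write(f"{clip_id}|{text}|{text}\n")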

TensorBoard:

[Screenshot from 2020-12-05 10-40-51]

Alignment:

[Alignment plot at 4,000 steps: 4000steps_1]

Predictions at 68k steps for the training utterance b'part23_2_244_252_Track_125-00040000-00043000-4':

[Alignment and mel-spectrogram plots for the decoded training sentence at 68k steps: 0_alignment, ali_txt_train, spect_txt_train]

The generated wave is not good and not understandable.

Decoded prediction for a test sentence at 68k steps:

[Alignment and mel-spectrogram plots for the decoded test sentence: ali_txt_test, spect_txt_test]

I use the code below to generate the wave and the plots:


import tensorflow as tf
import yaml
import numpy as np
import matplotlib.pyplot as plt
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import AutoProcessor
from scipy.io.wavfile import write

# Load the pretrained MB-MelGAN vocoder and the LJSpeech text processor/mapper.
mb_melgan_config = AutoConfig.from_pretrained('examples/multiband_melgan/conf/multiband_melgan.v1.yaml')
mb_melgan = TFAutoModel.from_pretrained(
    config=mb_melgan_config,
    pretrained_path="examples/multiband_melgan/generator-940000.h5",
    name="mb_melgan"
)

processor = AutoProcessor.from_pretrained(pretrained_path="dump_ljspeech_j2/ljspeech_mapper.json")
def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):
  input_ids = processor.text_to_sequence(input_text)
  #print(input_ids)
  # text2mel part
  if text2mel_name == "TACOTRON":
    _, mel_outputs, stop_token_prediction, alignment_history = text2mel_model.inference(
        tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        tf.convert_to_tensor([len(input_ids)], tf.int32),
        tf.convert_to_tensor([0], dtype=tf.int32)
    )
  elif text2mel_name == "FASTSPEECH":
    mel_before, mel_outputs, duration_outputs = text2mel_model.inference(
        input_ids=tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )
  elif text2mel_name == "FASTSPEECH2":
    mel_before, mel_outputs, duration_outputs, _, _ = text2mel_model.inference(
        tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0),
        speaker_ids=tf.convert_to_tensor([0], dtype=tf.int32),
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )
  else:
    raise ValueError("Only TACOTRON, FASTSPEECH, FASTSPEECH2 are supported on text2mel_name")
  # vocoder part
  if vocoder_name == "MELGAN" or vocoder_name == "MELGAN-STFT":
    audio = vocoder_model(mel_outputs)[0, :, 0]
  elif vocoder_name == "MB-MELGAN":
    audio = vocoder_model(mel_outputs)[0, :, 0]
  else:
    raise ValueError("Only MELGAN, MELGAN-STFT and MB_MELGAN are supported on vocoder_name")
  if text2mel_name == "TACOTRON":
    return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()
  else:
    return mel_outputs.numpy(), audio.numpy()

def visualize_attention(alignment_history):
  import matplotlib.pyplot as plt
  fig = plt.figure(figsize=(8, 6))
  ax = fig.add_subplot(111)
  ax.set_title(f'Alignment steps')
  im = ax.imshow(
      alignment_history,
      aspect='auto',
      origin='lower',
      interpolation='none')
  fig.colorbar(im, ax=ax)
  xlabel = 'Decoder timestep'
  plt.xlabel(xlabel)
  plt.ylabel('Encoder timestep')
  plt.tight_layout()
  plt.show()
  plt.close()

def visualize_mel_spectrogram(mels):
  mels = tf.reshape(mels, [-1, 80]).numpy()
  fig = plt.figure(figsize=(10, 8))
  ax1 = fig.add_subplot(311)
  ax1.set_title(f'Predicted Mel-after-Spectrogram')
  im = ax1.imshow(np.rot90(mels), aspect='auto', interpolation='none')
  fig.colorbar(mappable=im, shrink=0.65, orientation='horizontal', ax=ax1)
  plt.show()
  plt.close()

def generate_wave(input_text):
  # setup window for tacotron2 if you want to try
  tacotron2.setup_window(win_front=10, win_back=10)
  mels, alignment_history, audios = do_synthesis(input_text, tacotron2, mb_melgan, "TACOTRON", "MB-MELGAN")
  write('test.wav', 22050, audios)
  visualize_attention(alignment_history[0])
  visualize_mel_spectrogram(mels[0])

# Load the Tacotron-2 checkpoint at 68k steps and synthesize a Farsi test sentence.
tacotron2_config = AutoConfig.from_pretrained('examples/tacotron2/conf/tacotron2.v1.yaml')
num_epoch = 68800
txt = 'سلام. از اینکه با شما هستم خوشحالم.'  # "Hello. I'm glad to be with you."
pretrained_path1 = "/examples/tacotron2/exp/train.tacotron2.v1/checkpoints/model-" + str(num_epoch) + ".h5"
tacotron2 = TFAutoModel.from_pretrained(
    config=tacotron2_config,
    pretrained_path=pretrained_path1,
    training=False,
    name="tacotron2"
)
generate_wave(txt)

best regards

dathudeptrai commented 3 years ago


Can you try the newest code? Based on the TensorBoard, everything seems OK. Maybe you should try extracting durations and training FastSpeech2 to check whether the problem is with Tacotron-2 or with the dataset.
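For example, a minimal sketch of the duration-extraction idea (the alignment_to_durations helper below is only illustrative; it assumes the [encoder_steps, decoder_steps] alignment layout used in visualize_attention above, and the actual extract-duration script in the examples may differ):

import numpy as np

def alignment_to_durations(alignment, n_mel_frames):
    # alignment: [n_encoder_tokens, n_decoder_frames] attention matrix,
    # as plotted by visualize_attention above.
    # For each decoder frame, pick the encoder token it attends to most.
    best_token_per_frame = np.argmax(alignment[:, :n_mel_frames], axis=0)
    # Duration of a token = number of decoder frames assigned to it;
    # the durations then sum to n_mel_frames, which FastSpeech2 expects.
    durations = np.bincount(best_token_per_frame, minlength=alignment.shape[0])
    return durations.astype(np.int32)

# e.g. with the outputs of do_synthesis():
# mels, alignment_history, audios = do_synthesis(txt, tacotron2, mb_melgan, "TACOTRON", "MB-MELGAN")
# durations = alignment_to_durations(alignment_history[0], mels.shape[1])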