NVIDIA / tacotron2

Tacotron 2 - PyTorch implementation with faster-than-realtime inference
BSD 3-Clause "New" or "Revised" License

Silence after inference #593

Open maxmelichov opened 1 year ago

maxmelichov commented 1 year ago

Hello, I have an issue after running inference: after training, every time I use the model the output is silent.

https://colab.research.google.com/drive/12fYCOASNkFwQpr5hGkOx4X_soWLNRLLI#scrollTo=66xcyFEr0lB0

To use it in Colab I used the code below for hparams.py, and I fixed the librosa.filters.mel and librosa.util.pad_center calls in stft.py and layers.py (see the sketch after the hparams code). I would appreciate any help, thanks!

import ast

from text import symbols

class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

def create_hparams(hparams_string=None, verbose=False):
    """Create model hyperparameters. Parse nondefault from given string."""

    hparams = AttrDict({
        ################################
        # Experiment Parameters        #
        ################################
        "epochs":1500,
        "iters_per_checkpoint":1000,
        "seed":1234,
        "dynamic_loss_scaling":True,
        "fp16_run":False,
        "distributed_run":False,
        "dist_backend":"nccl",
        "dist_url":"tcp://localhost:14897",
        "cudnn_enabled":True,
        "cudnn_benchmark":False,
        "ignore_layers":['embedding.weight'],
        # "freeze_layers": ['encoder'],  # freeze Tacotron 2 encoder layers for fine-tuning

        ################################
        # Data Parameters             #
        ################################
        "load_mel_from_disk":False,
        "load_phone_from_disk":True,

        "training_files":'',
        "validation_files":'',

        "text_cleaners":['english_cleaners'],

        ################################
        # Audio Parameters             #
        ################################
        "max_wav_value":32768.0,
        "sampling_rate":22050,
        "filter_length":1024,
        "hop_length":256,
        "win_length":1024,
        "n_mel_channels":80,
        "mel_fmin":0.0,
        "mel_fmax":8000.0,

        ################################
        # Model Parameters             #
        ################################
        "n_symbols": 148,
        "symbols_embedding_dim":512,
        "alignloss": "L2",
        "attention": "StepwiseMonotonicAttention",

        # Encoder parameters
        "encoder_kernel_size":5,
        "encoder_n_convolutions":3,
        "encoder_embedding_dim":512,

        # Decoder parameters
        "n_frames_per_step":1,  # currently only 1 is supported
        "decoder_rnn_dim":1024,
        "prenet_dim":256,
        "max_decoder_steps":1000,
        "gate_threshold":0.001,
        "p_attention_dropout":0.1,
        "p_decoder_dropout":0.1,

        # Attention parameters
        "attention_rnn_dim":1024,
        "attention_dim":128,

        # Location Layer parameters
        "attention_location_n_filters":32,
        "attention_location_kernel_size":31,

        # Mel-post processing network parameters
        "postnet_embedding_dim":512,
        "postnet_kernel_size":5,
        "postnet_n_convolutions":5,

        ################################
        # Optimization Hyperparameters #
        ################################
        "use_saved_learning_rate":True,
        "learning_rate":1e-3,
        "weight_decay":1e-6,
        "grad_clip_thresh":1.0,
        "batch_size":32, # each gpus
        "mask_padding":True  # set model's padded outputs to padded values
    })

    if hparams_string:
        # Expected format: "{key1:value1-key2:value2}"
        for hp in hparams_string.strip("{} \n").split("-"):
            k, v = hp.split(":", 1)  # split on the first colon only, so values such as URLs survive
            if k in hparams:
                try:
                    v = ast.literal_eval(v)  # convert numbers and booleans from strings
                except (ValueError, SyntaxError):
                    pass  # leave non-literal values as plain strings
                hparams[k] = v
                print("Set hparam: {} to {}".format(k, v))

    return hparams
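
For the stft.py / layers.py changes mentioned above, here is a minimal sketch of the keyword-argument form that newer librosa versions (0.10 and later) expect; the values mirror the audio hparams above, and the exact call sites in your checkout may differ:

import librosa
from scipy.signal import get_window

# layers.py (TacotronSTFT): mel filterbank; these arguments are keyword-only in librosa >= 0.10
mel_basis = librosa.filters.mel(
    sr=22050,       # sampling_rate
    n_fft=1024,     # filter_length
    n_mels=80,      # n_mel_channels
    fmin=0.0,       # mel_fmin
    fmax=8000.0)    # mel_fmax

# stft.py (STFT): pad the analysis window to filter_length; size is keyword-only now
fft_window = get_window('hann', 1024, fftbins=True)          # win_length
fft_window = librosa.util.pad_center(fft_window, size=1024)   # filter_length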
ndz2011 commented 3 months ago

Try removing ".half()" when loading the model and WaveGlow.
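
For context, a minimal sketch of what that looks like, assuming the loading pattern from this repo's inference notebook (the checkpoint paths are placeholders); the idea is to keep both networks in full precision so the decoder and WaveGlow see matching dtypes:

import torch
from hparams import create_hparams
from train import load_model

hparams = create_hparams()

# Tacotron 2 in fp32: no .half() after .eval()
checkpoint_path = "tacotron2_statedict.pt"   # placeholder path
model = load_model(hparams)
model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
_ = model.cuda().eval()

# WaveGlow in fp32 as well
waveglow_path = "waveglow_256channels.pt"    # placeholder path
waveglow = torch.load(waveglow_path)['model']
waveglow.cuda().eval()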