Summary:
SileroTTS fails when the input text is too long; testing showed that inputs of approximately 900 or more characters trigger the failure.
Definition done:
Inputs to the speaker are evaluated for length; if an input is too long, the text is split into chunks at word boundaries, each chunk is synthesized into a separate audio file, and the files are re-combined after processing into a single output WAV file.
Error:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ <torch_package_1>.multi_acc_v3_package.py:338 in apply_tts │
│ │
│ /home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
File ".data/ts_code/code/__torch__/silero_vocoder/jit_model/___torch_mangle_138.py", line 29, in forward
_3 = torch.to(durs_rate, torch.device(device))
_4 = torch.to(pitch_coefs, torch.device(device))
_5 = (tts_model).forward(_1, _2, sr, symb_durs, _3, _4, gt_durs, gt_pitch, device, )
~~~~~~~~~~~~~~~~~~ <--- HERE
audio, audio_lengths, = _5
return (audio, audio_lengths)
File ".data/ts_code/code/__torch__/silero_vocoder/jit_model/___torch_mangle_137.py", line 153, in forward
pitch_hat = unchecked_cast(Tensor, gt_pitch)
tacotron = self.tacotron
mel_outputs = (tacotron).forward(sequence, speaker_ids, orig_mask, dur_hat, pitch_hat, )
~~~~~~~~~~~~~~~~~ <--- HERE
if torch.__isnot__(symb_durs4, None):
symb_durs20 = unchecked_cast(Dict[int, int], symb_durs4)
File ".data/ts_code/code/__torch__/jit_forward_model/___torch_mangle_132.py", line 43, in forward
encoder_outputs_expanded = (len_reg).forward(cond_encoder_outputs0, dur_hat, )
decoder = self.decoder
outputs_expanded = (decoder).forward(encoder_outputs_expanded, None, )
~~~~~~~~~~~~~~~~ <--- HERE
lin = self.lin
mel_outputs = (lin).forward(outputs_expanded, )
File ".data/ts_code/code/__torch__/tacotron2/fastpitch_layers.py", line 118, in forward
x8 = torch.transpose(x, 0, 1)
pos_encoder = self.pos_encoder
x9 = (pos_encoder).forward(x8, )
~~~~~~~~~~~~~~~~~~~~ <--- HERE
x10 = torch.transpose(x9, 0, 1)
pre_vanilla_layers = self.pre_vanilla_layers
File ".data/ts_code/code/__torch__/tacotron2/fastpitch_layers.py", line 42, in forward
_0 = torch.slice(pe, 0, None, torch.size(x, 0))
_1 = torch.mul(scale, torch.slice(_0, 1))
x7 = torch.add(x, _1)
~~~~~~~~~ <--- HERE
dropout = self.dropout
return (dropout).forward(x7, )
Traceback of TorchScript, original code (most recent call last):
File "../../silero_vocoder/jit_model.py", line 69, in forward
sequence, symb_durs, durs_rate, pitch_coefs = self.merge_batch_model(sentences, break_lens, prosody_rates,
prosody_pitches)
audio, audio_lengths = self.tts_model(sequence=sequence.to(device),
~~~~~~~~~~~~~~ <--- HERE
speaker_ids=speaker_ids.to(device),
sr=sr,
File "../../silero_vocoder/jit_model.py", line 457, in forward
pitch_hat = gt_pitch
mel_outputs = self.tacotron(sequence, speaker_ids, orig_mask, dur_hat, pitch_hat)
~~~~~~~~~~~~~ <--- HERE
if symb_durs is not None and len(symb_durs) > 0:
mel_outputs = self.fx_pauses(mel_outputs, dur_hat, symb_durs)
File "/home/keras/notebook/nvme2/islanna/silero_vocoder/tacotron2/jit_forward_model.py", line 110, in forward
# [B, Lexp, Denc]
outputs_expanded = self.decoder(encoder_outputs_expanded,
~~~~~~~~~~~~ <--- HERE
src_pad_mask=None)
File "../tacotron2/fastpitch_layers.py", line 335, in forward
x = x.transpose(0, 1)
# [L, B, d_m]
x = self.pos_encoder(x)
~~~~~~~~~~~~~~~~ <--- HERE
# [B, L, d_m]
x = x.transpose(0, 1)
File "../tacotron2/fastpitch_layers.py", line 53, in forward
def forward(self, x: torch.Tensor) -> torch.Tensor: # shape: [T, N]
x = x + self.scale * self.pe[:x.size(0), :]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
return self.dropout(x)
RuntimeError: The size of tensor a (8186) must match the size of tensor b (5000) at non-singleton dimension 0
Summary: SileroTTS fails when the input text is too long; testing showed that inputs of approximately 900 or more characters trigger the failure.
Definition done: Inputs to the speaker are evaluated for length; if an input is too long, the text is split into chunks at word boundaries, each chunk is synthesized into a separate audio file, and the files are re-combined after processing into a single output WAV file.
Error: