Summary:
SileroTTS fails when the input text is too long; testing showed that inputs of approximately 900 or more characters trigger the failure.
Definition done:
Inputs to the speaker are evaluated for length; if an input is too long, the text is split into chunks at word boundaries, each chunk is synthesized into a separate audio file, and the files are re-combined after processing into a single output WAV file.
Error:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ <torch_package_1>.multi_acc_v3_package.py:338 in apply_tts │
│ │
│ /home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py:1501 in _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript, serialized code (most recent call last):
File ".data/ts_code/code/__torch__/silero_vocoder/jit_model/___torch_mangle_138.py", line 29, in forward
_3 = torch.to(durs_rate, torch.device(device))
_4 = torch.to(pitch_coefs, torch.device(device))
_5 = (tts_model).forward(_1, _2, sr, symb_durs, _3, _4, gt_durs, gt_pitch, device, )
~~~~~~~~~~~~~~~~~~ <--- HERE
audio, audio_lengths, = _5
return (audio, audio_lengths)
File ".data/ts_code/code/__torch__/silero_vocoder/jit_model/___torch_mangle_137.py", line 153, in forward
pitch_hat = unchecked_cast(Tensor, gt_pitch)
tacotron = self.tacotron
mel_outputs = (tacotron).forward(sequence, speaker_ids, orig_mask, dur_hat, pitch_hat, )
~~~~~~~~~~~~~~~~~ <--- HERE
if torch.__isnot__(symb_durs4, None):
symb_durs20 = unchecked_cast(Dict[int, int], symb_durs4)
File ".data/ts_code/code/__torch__/jit_forward_model/___torch_mangle_132.py", line 43, in forward
encoder_outputs_expanded = (len_reg).forward(cond_encoder_outputs0, dur_hat, )
decoder = self.decoder
outputs_expanded = (decoder).forward(encoder_outputs_expanded, None, )
~~~~~~~~~~~~~~~~ <--- HERE
lin = self.lin
mel_outputs = (lin).forward(outputs_expanded, )
File ".data/ts_code/code/__torch__/tacotron2/fastpitch_layers.py", line 118, in forward
x8 = torch.transpose(x, 0, 1)
pos_encoder = self.pos_encoder
x9 = (pos_encoder).forward(x8, )
~~~~~~~~~~~~~~~~~~~~ <--- HERE
x10 = torch.transpose(x9, 0, 1)
pre_vanilla_layers = self.pre_vanilla_layers
File ".data/ts_code/code/__torch__/tacotron2/fastpitch_layers.py", line 42, in forward
_0 = torch.slice(pe, 0, None, torch.size(x, 0))
_1 = torch.mul(scale, torch.slice(_0, 1))
x7 = torch.add(x, _1)
~~~~~~~~~ <--- HERE
dropout = self.dropout
return (dropout).forward(x7, )
Traceback of TorchScript, original code (most recent call last):
File "../../silero_vocoder/jit_model.py", line 69, in forward
sequence, symb_durs, durs_rate, pitch_coefs = self.merge_batch_model(sentences, break_lens, prosody_rates,
prosody_pitches)
audio, audio_lengths = self.tts_model(sequence=sequence.to(device),
~~~~~~~~~~~~~~ <--- HERE
speaker_ids=speaker_ids.to(device),
sr=sr,
File "../../silero_vocoder/jit_model.py", line 457, in forward
pitch_hat = gt_pitch
mel_outputs = self.tacotron(sequence, speaker_ids, orig_mask, dur_hat, pitch_hat)
~~~~~~~~~~~~~ <--- HERE
if symb_durs is not None and len(symb_durs) > 0:
mel_outputs = self.fx_pauses(mel_outputs, dur_hat, symb_durs)
File "/home/keras/notebook/nvme2/islanna/silero_vocoder/tacotron2/jit_forward_model.py", line 110, in forward
# [B, Lexp, Denc]
outputs_expanded = self.decoder(encoder_outputs_expanded,
~~~~~~~~~~~~ <--- HERE
src_pad_mask=None)
File "../tacotron2/fastpitch_layers.py", line 335, in forward
x = x.transpose(0, 1)
# [L, B, d_m]
x = self.pos_encoder(x)
~~~~~~~~~~~~~~~~ <--- HERE
# [B, L, d_m]
x = x.transpose(0, 1)
File "../tacotron2/fastpitch_layers.py", line 53, in forward
def forward(self, x: torch.Tensor) -> torch.Tensor: # shape: [T, N]
x = x + self.scale * self.pe[:x.size(0), :]
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ <--- HERE
return self.dropout(x)
RuntimeError: The size of tensor a (8186) must match the size of tensor b (5000) at non-singleton dimension 0
Summary: SileroTTS fails when the input text is too long; testing showed that inputs of approximately 900 or more characters trigger the failure.
Definition done: Inputs to the speaker are evaluated for length; if an input is too long, the text is split into chunks at word boundaries, each chunk is synthesized into a separate audio file, and the files are re-combined after processing into a single output WAV file.
Error: