Describe the bug

The example code gives an error when saving.

To Reproduce

import os
import time
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
print("Loading model...")
config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", use_deepspeed=True)
model.cuda()
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference.wav"])
print("Inference...")
t0 = time.time()
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)
wav_chunks = []
for i, chunk in enumerate(chunks):
    if i == 0:
        print(f"Time to first chunk: {time.time() - t0}")
    print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
    wav_chunks.append(chunk)
wav = torch.cat(wav_chunks, dim=0)
torchaudio.save("xtts_streaming.wav", wav.squeeze().unsqueeze(0).cpu(), 24000)

Expected behavior

I expect it to save a wav file.
Logs

Traceback (most recent call last):
  ...
    if elements.device.type == "mps" and not is_torch_greater_or_equal_than_2_4:
AttributeError: 'int' object has no attribute 'device'
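
For context, the quoted source line matches the isin_mps_friendly helper in transformers.pytorch_utils, which assumes its first argument is a torch.Tensor. A minimal sketch of the same mismatch is below; this assumes a recent transformers release where that helper exists, and the exact call site inside the XTTS streaming code is not shown in the partial log.

# Not part of the original report: a sketch reproducing the error type,
# assuming the traceback line comes from transformers' isin_mps_friendly.
import torch
from transformers.pytorch_utils import isin_mps_friendly

# Works: `elements` is a tensor, as the helper expects.
print(isin_mps_friendly(torch.tensor([1, 2, 3]), torch.tensor([2])))

# Fails like the report: a plain Python int has no .device attribute.
print(isin_mps_friendly(2, torch.tensor([1, 2, 3])))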

Try using our fork (available via pip install coqui-tts). This repo is not updated anymore, and the streaming code here doesn't work with recent versions of transformers. That's probably the reason, but it's hard to tell because you didn't include the full error log.
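
If you want to try that suggestion, here is a minimal sketch. It assumes the fork is published on PyPI as coqui-tts and keeps the same TTS import namespace, so the reproduction script above should run without code changes.

# Sketch only, with assumptions noted above: swap packages, then confirm
# that the fork's modules import under the same names.
#
#   pip uninstall -y TTS
#   pip install coqui-tts
#
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

print("coqui-tts fork imported OK")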