Hello,
I have the following problem and thought you might be able to help me:
Describe the bug
When the application runs inside a Docker container, a TypeError is raised from a torch.isin() call inside the transformers library. The error does not occur when the application runs locally (outside the container), which suggests an incompatibility or version mismatch among the dependencies installed in the image.
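For isolation: based purely on the overload list in the traceback under Logs (I have not traced the exact call site inside transformers), the same TypeError can presumably be reproduced without the TTS stack at all. A minimal sketch, assuming the failure is the keyword spelling used for the scalar overload:

import torch

ids = torch.tensor([42])

# A positional scalar dispatches to the (Tensor elements, Number test_element)
# overload and works on torch 2.2.0:
print(torch.isin(ids, 42))

# The keyword spelling shown in the traceback matches no overload, because the
# scalar variant names its keyword test_element (singular):
torch.isin(elements=ids, test_elements=42)  # raises the TypeError from the Logs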
To Reproduce
Build the Docker image using the provided Dockerfile.
Dockerfile:
FROM python:3.11.8-slim
ENV PYTHONUNBUFFERED=1
# Install system dependencies and Rust
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        curl \
        libsndfile1 \
        libgomp1 \
        pkg-config \
        libssl-dev && \
    curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
ENV COQUI_TOS_AGREED=1
# Update pip to the latest version
RUN pip install --upgrade pip
# Install Python dependencies
RUN pip install --no-cache-dir fastapi uvicorn torch==2.2.0 torchaudio==2.2.0 transformers==4.43.1 numpy==1.24.3 TTS==0.22.0 sudachipy cutlet
# NOTE: this pulls the latest transformers, overriding the 4.43.1 pin above
RUN pip install --upgrade transformers
# Copy the FastAPI application code
COPY main.py /app/main.py
WORKDIR /app
EXPOSE 8001
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8001"]
main.py:
import io
import os
import wave
import torch
import numpy as np
from fastapi import FastAPI, Request, Header, Body
from fastapi.responses import StreamingResponse
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.generic_utils import get_user_data_dir
from TTS.utils.manage import ModelManager
# Set the number of threads and device
torch.set_num_threads(int(os.environ.get("NUM_THREADS", os.cpu_count())))
device = torch.device("cuda" if torch.cuda.is_available() and os.environ.get("USE_CPU", "0") == "0" else "cpu")
# Load custom model if available, otherwise download the default model
custom_model_path = os.environ.get("CUSTOM_MODEL_PATH", "/app/tts_models")
if os.path.exists(custom_model_path) and os.path.isfile(custom_model_path + "/config.json"):
    model_path = custom_model_path
    print("Loading custom model from", model_path, flush=True)
else:
    print("Loading default model", flush=True)
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    print("Downloading XTTS Model:", model_name, flush=True)
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS Model downloaded", flush=True)
# Load model configuration and model
print("Loading XTTS", flush=True)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir=model_path, eval=True, use_deepspeed=(device.type == "cuda"))
model.to(device)
print("XTTS Loaded.", flush=True)
# Initialize FastAPI
app = FastAPI(
    title="XTTS Streaming server",
    description="XTTS Streaming server",
    version="0.0.1",
    docs_url="/",
)
# Helper functions
def postprocess(wav):
    if isinstance(wav, list):
        wav = torch.cat(wav, dim=0)
    wav = wav.clone().detach().cpu().numpy()
    wav = wav[None, : int(wav.shape[0])]
    wav = np.clip(wav, -1, 1)
    wav = (wav * 32767).astype(np.int16)
    return wav
def wav_data_generator(frame_input, sample_rate=24000, sample_width=2, channels=1):
    wav_buf = io.BytesIO()
    with wave.open(wav_buf, "wb") as vfout:
        vfout.setnchannels(channels)
        vfout.setsampwidth(sample_width)
        vfout.setframerate(sample_rate)
        vfout.writeframes(frame_input)
    wav_buf.seek(0)
    return wav_buf.read()
# Streaming generator
def predict_streaming_generator(text, language, add_wav_header, stream_chunk_size):
    speaker_name = "Alison Dietlinde"
    speaker_raw = model.speaker_manager.speakers[speaker_name]["speaker_embedding"].cpu().squeeze().half().tolist()
    gpt_raw = model.speaker_manager.speakers[speaker_name]["gpt_cond_latent"].cpu().squeeze().half().tolist()
    speaker_embedding = torch.tensor(speaker_raw).unsqueeze(0).unsqueeze(-1)
    gpt_cond_latent = torch.tensor(gpt_raw).reshape((-1, 1024)).unsqueeze(0)
    chunks = model.inference_stream(
        text,
        language,
        gpt_cond_latent,
        speaker_embedding,
        stream_chunk_size=int(stream_chunk_size),
        enable_text_splitting=True,
    )
    for i, chunk in enumerate(chunks):
        chunk = postprocess(chunk)
        if i == 0 and add_wav_header:
            # Emit an empty WAV header first so players can start decoding
            # the raw PCM chunks that follow.
            yield wav_data_generator(b"")
        yield chunk.tobytes()
# FastAPI endpoint for streaming
@app.post("/tts_stream")
async def predict_streaming_endpoint(
text: str = Header(...),
language: str = Header(...),
add_wav_header: bool = Header(True),
stream_chunk_size: str = Header("20")
):
try:
return StreamingResponse(
predict_streaming_generator(text,language, add_wav_header, stream_chunk_size),
media_type="audio/wav"
)
except Exception as e:
raise
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8001)
Start the Docker container.
Make a POST request to the /tts_stream endpoint with the appropriate headers and data.
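Before the full client below, a stripped-down request is enough to trigger the error; the header names mirror the endpoint signature in main.py:

import requests

resp = requests.post(
    "http://localhost:8001/tts_stream",
    headers={
        "text": "This is a Test.",
        "language": "en",
        "add_wav_header": "False",
        "stream_chunk_size": "20",
    },
    stream=True,
)
print("status:", resp.status_code)
# Consume the stream; the TypeError itself surfaces in the container logs, and
# the connection is typically cut off once the server-side generator raises.
for chunk in resp.iter_content(chunk_size=512):
    pass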
test.py:
import argparse
import json
import shutil
import subprocess
import sys
import time
from typing import Iterator
import requests
def is_installed(lib_name: str) -> bool:
    return shutil.which(lib_name) is not None
def save(audio: bytes, filename: str) -> None:
    with open(filename, "wb") as f:
        f.write(audio)
def stream_ffplay(audio_stream, output_file, save=True):
    if not save:
        ffplay_cmd = ["ffplay", "-nodisp", "-probesize", "1024", "-autoexit", "-"]
    else:
        print("Saving to", output_file)
        ffplay_cmd = ["ffmpeg", "-probesize", "1024", "-i", "-", output_file]
    ffplay_proc = subprocess.Popen(ffplay_cmd, stdin=subprocess.PIPE)
    for chunk in audio_stream:
        if chunk is not None:
            ffplay_proc.stdin.write(chunk)
    # close on finish
    ffplay_proc.stdin.close()
    ffplay_proc.wait()
def tts(text, language, server_url, stream_chunk_size) -> Iterator[bytes]:
    start = time.perf_counter()
    headers = {
        "text": text,
        "language": language,
        "add_wav_header": "False",
        "stream_chunk_size": stream_chunk_size,
    }
    res = requests.post(
        f"{server_url}/tts_stream",
        headers=headers,
        stream=True,
    )
    end = time.perf_counter()
    print(f"Time to make POST: {end - start}s", file=sys.stderr)
    if res.status_code != 200:
        print("Error:", res.text)
        sys.exit(1)
    first = True
    for chunk in res.iter_content(chunk_size=512):
        if first:
            end = time.perf_counter()
            print(f"Time to first chunk: {end - start}s", file=sys.stderr)
            first = False
        if chunk:
            yield chunk
    print("⏱️ response.elapsed:", res.elapsed)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        default="It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
        help="Text input for TTS",
    )
    parser.add_argument(
        "--language",
        default="en",
        help="Language to use; default is 'en' (English)",
    )
    parser.add_argument(
        "--output_file",
        default=None,
        help="Save TTS output to the given filename",
    )
    parser.add_argument(
        "--ref_file",
        default=None,
        help="Reference audio file to use; when not given, the default is used",
    )
    parser.add_argument(
        "--server_url",
        default="http://localhost:8000",
        help="Server URL; default is http://localhost:8000, change to your server location",
    )
    parser.add_argument(
        "--stream_chunk_size",
        default="20",
        help="Stream chunk size; default is 20. Reducing it lowers latency but may degrade quality",
    )
    args = parser.parse_args()

    # The speaker loaded here is not used further in this script.
    with open("./default_speaker.json", "r") as file:
        speaker = json.load(file)

    if args.ref_file is not None:
        print("Computing the latents for a new reference...")

    audio = stream_ffplay(
        tts(
            args.text,
            args.language,
            args.server_url,
            args.stream_chunk_size,
        ),
        args.output_file,
        save=bool(args.output_file),
    )
CMD:
python test.py --text "This is a Test." --language en --server_url "http://localhost:8001" --stream_chunk_size 145
Expected behavior
No response
Logs
TypeError: isin() received an invalid combination of arguments - got (test_elements=int, elements=Tensor, ), but expected one of:
* (Tensor elements, Tensor test_elements, *, bool assume_unique, bool invert, Tensor out)
* (Number element, Tensor test_elements, *, bool assume_unique, bool invert, Tensor out)
* (Tensor elements, Number test_element, *, bool assume_unique, bool invert, Tensor out)
Environment
Base image python:3.11.8-slim; torch 2.2.0, torchaudio 2.2.0, transformers 4.43.1 (then upgraded to latest by the Dockerfile), numpy 1.24.3, TTS 0.22.0.
Additional context
No response