Hi all, I am quite new to audio processing and am working on a project another dev created that uses sounddevice to stream audio. We want the latency between when the audio is generated and when it is played to be minimal. However, the current method seems to distort the playback: if we save the audio to a file instead of streaming it, the quality is much better.
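For comparison, the save path that sounds clean is roughly this (a minimal sketch; get_audio() is the same elided call to our TTS service that the streaming code uses):

    import io
    import scipy.io.wavfile

    data = get_audio()  # same TTS response the streaming code consumes (elided)
    sample_rate, samples = scipy.io.wavfile.read(io.BytesIO(data))
    scipy.io.wavfile.write("out.wav", sample_rate, samples)  # plays back clean

Here is the streaming code: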
import io
import logging
import queue
import time

import numpy as np
import requests
import scipy.io.wavfile
import sounddevice as sd
def run(app: App, input_queue: queue.Queue):
    log = logging.getLogger("tts")
    buffer = []

    def buffer_duration():
        # total seconds of audio currently queued, at a 24 kHz sample rate
        rv = 0.0
        for b in buffer:
            rv += len(b) / 24000
        return rv
    def callback(outdata, frames, time, status):
        # audio frame callback: play from the buffer, or silence if it is empty
        if len(buffer) == 0:
            outdata[:] = np.zeros((frames, 1))
            return
        # fill outdata from the queued chunks
        while len(buffer) > 0 and frames > 0:
            if len(buffer[0]) > frames:
                # the first chunk has more samples than we need: copy a
                # frames-sized slice and keep the remainder for next time
                outdata[:frames] = buffer[0][:frames]
                buffer[0] = buffer[0][frames:]
                frames = 0
            else:
                # the first chunk fits entirely: copy it, advance the output
                # view past it, and drop the chunk from the buffer
                outdata[: len(buffer[0])] = buffer[0]
                outdata = outdata[len(buffer[0]) :]
                frames -= len(buffer[0])
                buffer.pop(0)
        # if frames are still unfilled, pad with silence
        if frames > 0:
            outdata[:frames] = np.zeros((frames, 1))
    def generate(text: str, voice_settings: dict):
        data = get_audio()  # fetch the synthesized audio (network request, elided)
        sample_rate, samples = scipy.io.wavfile.read(io.BytesIO(data))
        assert sample_rate == 24000
        assert len(samples.shape) == 1
        # reshape mono samples to (n, 1) to match the stream's channel layout
        samples = samples.reshape((len(samples), 1))
        buffer.append(samples)
    stream = sd.OutputStream(
        channels=1, callback=callback, samplerate=24000, blocksize=2000,
    )
    stream.start()
    text_buffer = []
    stop = False
    while True:
        try:
            item = input_queue.get(timeout=0.1)
            if item is None:
                log.info("Got stop signal.")
                stop = True
            else:
                text_buffer.append(item)
        except queue.Empty:
            pass
        dur = buffer_duration()
        app.is_speaking = dur > 0.5
        if len(text_buffer) > 0 and (dur < 4 or stop):
            # concatenate queued text chunks as long as their settings match
            text = ""
            voice_settings = text_buffer[0][1]
            while len(text_buffer) > 0 and text_buffer[0][1] == voice_settings:
                text += text_buffer.pop(0)[0].strip() + " "
            generate(text.strip(), voice_settings)
        if stop:
            # wait for the audio buffer to drain before stopping the stream
            while len(buffer) > 0:
                time.sleep(0.1)
            stream.stop()
            break
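One detail we did check: re-binding outdata to a slice inside the callback should still write into the real output buffer, since basic numpy slicing returns a view rather than a copy:

    import numpy as np

    a = np.zeros(5)
    b = a[2:]      # a view into a, not a copy
    b[:] = 1.0
    print(a)       # [0. 0. 1. 1. 1.] -- writes through the view reach a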
We believe this is due to how we are copying into the output buffer in the callback function, but I am not sure what is being done wrong. I've read the docs and tried copying the approach from this example, https://python-sounddevice.readthedocs.io/en/0.3.15/_downloads/47cc99d1f4d4b5d914d6a05da8e75553/play_long_file.py, instead, but I was unable to get it to work. Any insight would be appreciated, thank you!!
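For completeness, here is roughly what I tried when adapting that example (reconstructed from memory and simplified, so treat names like enqueue() as illustration only, not our exact code):

    import queue

    blocksize = 2000
    q = queue.Queue(maxsize=20)  # bounded, so the producer can't run ahead forever

    def callback(outdata, frames, time, status):
        try:
            data = q.get_nowait()  # one pre-cut block per callback
        except queue.Empty:
            outdata.fill(0)        # underrun: play silence
            return
        if len(data) < frames:
            # last, short block: copy it and pad the rest with silence
            outdata[:len(data)] = data
            outdata[len(data):].fill(0)
        else:
            outdata[:] = data

    def enqueue(samples):
        # producer side: cut the decoded (n, 1) samples into blocksize pieces
        for i in range(0, len(samples), blocksize):
            q.put(samples[i:i + blocksize])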