Hi all, I am quite new to audio processing and am working on a project another dev created that uses sounddevice to stream audio. We want the latency between when the audio is generated and when it is played to be minimal. However, the current method seems to distort the playback: if we save the audio to a file instead of streaming it, the quality is much better.
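For comparison, the save path that sounds clean is roughly this (a minimal sketch; get_audio() is the same elided call to our TTS service that the streaming code uses):

    import io
    import scipy.io.wavfile

    data = get_audio()  # same TTS response the streaming code consumes (elided)
    sample_rate, samples = scipy.io.wavfile.read(io.BytesIO(data))
    scipy.io.wavfile.write("out.wav", sample_rate, samples)  # plays back clean

Here is the streaming code: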
import io
import logging
import queue
import time

import numpy as np
import requests
import scipy.io.wavfile
import sounddevice as sd
def run(app: App, input_queue: queue.Queue):
    log = logging.getLogger("tts")
    buffer = []

    def buffer_duration():
        # total seconds of audio currently queued, at a 24 kHz sample rate
        rv = 0.0
        for b in buffer:
            rv += len(b) / 24000
        return rv
    def callback(outdata, frames, time, status):
        # audio frame callback: play from the buffer, or silence if it is empty
        if len(buffer) == 0:
            outdata[:] = np.zeros((frames, 1))
            return
        # fill outdata from the queued chunks
        while len(buffer) > 0 and frames > 0:
            if len(buffer[0]) > frames:
                # the first chunk has more samples than we need: copy a
                # frames-sized slice and keep the remainder for next time
                outdata[:frames] = buffer[0][:frames]
                buffer[0] = buffer[0][frames:]
                frames = 0
            else:
                # the first chunk fits entirely: copy it, advance the output
                # view past it, and drop the chunk from the buffer
                outdata[: len(buffer[0])] = buffer[0]
                outdata = outdata[len(buffer[0]) :]
                frames -= len(buffer[0])
                buffer.pop(0)
        # if frames are still unfilled, pad with silence
        if frames > 0:
            outdata[:frames] = np.zeros((frames, 1))
    def generate(text: str, voice_settings: dict):
        data = get_audio()  # fetch the synthesized audio (network request, elided)
        sample_rate, samples = scipy.io.wavfile.read(io.BytesIO(data))
        assert sample_rate == 24000
        assert len(samples.shape) == 1
        # reshape mono samples to (n, 1) to match the stream's channel layout
        samples = samples.reshape((len(samples), 1))
        buffer.append(samples)
    stream = sd.OutputStream(
        channels=1, callback=callback, samplerate=24000, blocksize=2000,
    )
    stream.start()
    text_buffer = []
    stop = False
    while True:
        try:
            item = input_queue.get(timeout=0.1)
            if item is None:
                log.info("Got stop signal.")
                stop = True
            else:
                text_buffer.append(item)
        except queue.Empty:
            pass
        dur = buffer_duration()
        app.is_speaking = dur > 0.5
        if len(text_buffer) > 0 and (dur < 4 or stop):
            # concatenate queued text chunks as long as their settings match
            text = ""
            voice_settings = text_buffer[0][1]
            while len(text_buffer) > 0 and text_buffer[0][1] == voice_settings:
                text += text_buffer.pop(0)[0].strip() + " "
            generate(text.strip(), voice_settings)
        if stop:
            # wait for the audio buffer to drain before stopping the stream
            while len(buffer) > 0:
                time.sleep(0.1)
            stream.stop()
            break
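One detail we did check: re-binding outdata to a slice inside the callback should still write into the real output buffer, since basic numpy slicing returns a view rather than a copy:

    import numpy as np

    a = np.zeros(5)
    b = a[2:]      # a view into a, not a copy
    b[:] = 1.0
    print(a)       # [0. 0. 1. 1. 1.] -- writes through the view reach a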
We believe this is due to how we are copying into the output buffer in the callback function, but I am not sure what is being done wrong. I've read the docs and tried copying the approach from this example, https://python-sounddevice.readthedocs.io/en/0.3.15/_downloads/47cc99d1f4d4b5d914d6a05da8e75553/play_long_file.py, instead, but I was unable to get it to work. Any insight would be appreciated, thank you!!
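For completeness, here is roughly what I tried when adapting that example (reconstructed from memory and simplified, so treat names like enqueue() as illustration only, not our exact code):

    import queue

    blocksize = 2000
    q = queue.Queue(maxsize=20)  # bounded, so the producer can't run ahead forever

    def callback(outdata, frames, time, status):
        try:
            data = q.get_nowait()  # one pre-cut block per callback
        except queue.Empty:
            outdata.fill(0)        # underrun: play silence
            return
        if len(data) < frames:
            # last, short block: copy it and pad the rest with silence
            outdata[:len(data)] = data
            outdata[len(data):].fill(0)
        else:
            outdata[:] = data

    def enqueue(samples):
        # producer side: cut the decoded (n, 1) samples into blocksize pieces
        for i in range(0, len(samples), blocksize):
            q.put(samples[i:i + blocksize])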