Azure-Samples / cognitive-services-speech-sdk

Sample code for the Microsoft Cognitive Services Speech SDK
MIT License
2.95k stars 1.86k forks source link

Segmentation Fault when I call speak_text_async #2011

Closed sunwoo604 closed 1 year ago

sunwoo604 commented 1 year ago

I am trying to make a code where it gets input from the user on the web and when the message is received, get the tts data from speech_synthesizer using push stream and from each audio buffer, I create audioframe and send it to the web using webrtc. If I call speak_text_async before the connection of audio stream is fully established, it works perfectly but if I call while the stream is added to the track. It says segmentation fault python3 server.py. These are the part of the code of server.py


async def offer(request):
    params = await request.json()
    offer = RTCSessionDescription(sdp=params["sdp"], type=params["type"])

    pc = RTCPeerConnection()
    pc_id = "PeerConnection(%s)" % uuid.uuid4()
    pcs.add(pc)

    def log_info(msg, *args):
        logger.info(pc_id + " " + msg, *args)

    log_info("Created for %s", request.remote)
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.set_speech_synthesis_output_format(speechsdk.SpeechSynthesisOutputFormat.Raw48Khz16BitMonoPcm)
    stream_callback = PushAudioOutputStreamSampleCallback()
    audtr = AudioTransformTrack(stream_callback)
    push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
    stream_config = speechsdk.audio.AudioOutputConfig(stream=push_stream)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=stream_config)
    text='hi how are you'
    speech_synthesizer.speak_text_async(text).get()

    if args.record_to:
        recorder = MediaRecorder(args.record_to)
    else:
        recorder = MediaBlackhole()
    @pc.on("datachannel")
    def on_datachannel(channel):
        @channel.on("message")
        def on_message(message):
            print(message)
            nonlocal speech_synthesizer
            result = speech_synthesizer.speak_text_async(message).get()
            print(result.reason)

in here, hi how are you works fine but any message recived will cause seg fault.

This is how I constructed stream and stuff


class PushAudioOutputStreamSampleCallback(speechsdk.audio.PushAudioOutputStreamCallback):
    """
    Example class that implements the PushAudioOutputStreamCallback, which is used to show
    how to push output audio to a stream
    """
    def __init__(self) -> None:
        super().__init__()
        self._audio_data = asyncio.Queue()
        self._closed = False

    async def write_async(self, audio_buffer: memoryview):
        """
        The callback function which is invoked when the synthesizer has an output audio chunk
        to write out
        """
        print("recieved")
        for i in range(0,len(audio_buffer),1920):
            if len(audio_buffer[i:i+1920])!=1920:
                print(str(len(audio_buffer[i:i+1920]))+" samples aborted")
                continue
            FORMAT = str(1920//2)+'h'
            samp = audio_buffer[i:i+1920]
            unp = np.array([unpack(FORMAT,samp)]).astype(np.int16)
            samp_f = AudioFrame.from_ndarray(unp,format='s16',layout='mono')
            samp_f.sample_rate = 48000
            await self._audio_data.put(samp_f)

    def write(self, audio_buffer: memoryview) -> int:

        asyncio.run(self.write_async(audio_buffer))

        return audio_buffer.nbytes

    def close(self) -> None:
        """
        The callback function which is invoked when the synthesizer is about to close the
        stream.
        """
        self._closed = True
        print("Push audio output stream closed.")

    def get_audio_data(self) -> bytes:
        return self._audio_data

    def get_audio_size(self) -> int:
        return len(self._audio_data)

    async def pop_audio_buffer(self):
        if self._audio_data.empty():
            return None
        out = await self._audio_data.get()
        return out

class AudioTransformTrack(MediaStreamTrack):

    kind = "audio"

    def __init__(self,buffer):
        super().__init__()  # don't forget this!
        self.buffer = buffer
        self.started = False
        audio = np.zeros((1, 960), dtype=np.int16)
        self.empty_frame = av.AudioFrame.from_ndarray(audio, layout='mono', format='s16')
        self.time_base = fractions.Fraction(1/48000)
        self.idx=0
        self.start_time = None
        self.sample_rate = 48000

    async def recv(self):
        frame = await self.buffer.pop_audio_buffer()
        data_time = float(960*self.idx*self.time_base)
        if frame is None:
            frame = self.empty_frame
        frame.pts = self.idx*960
        frame.sample_rate = 48000
        self.idx+= 1
        if self.start_time is None:
            self.start_time = time.time() - data_time
        else:
            wait = self.start_time + data_time  - time.time()
            await asyncio.sleep(wait)
        print(frame)
        return frame
    def stop(self):
        super().stop()

and this is how my audio stream track is added to the connection


    @pc.on("connectionstatechange")
    async def on_connectionstatechange():
        log_info("Connection state is %s", pc.connectionState)
        if pc.connectionState == "failed":
            await pc.close()
            pcs.discard(pc)

    @pc.on("track")
    def on_track(track):
        log_info("Track %s received", track.kind)
        pc.addTrack(audtr)
        @track.on("ended")
        async def on_ended():
            log_info("Track %s ended", track.kind)
            await recorder.stop()

I am combining examples from this github sources sample/python/console/speech_synthesis_sample.py/ def speech_synthesis_to_push_audio_output_stream() and the example from https://github.com/aiortc/aiortc/tree/main/examples/server. If someone is experienced in this, please help me out

pankopon commented 1 year ago

@yulin-li Please check.

sunwoo604 commented 1 year ago

fixed