deepgram / deepgram-python-sdk

Official Python SDK for Deepgram's automated speech recognition APIs.
https://developers.deepgram.com
MIT License
218 stars 58 forks source link

Issues running Deepgram with fastapi sockets #361

Closed sunil-sopho closed 5 months ago

sunil-sopho commented 5 months ago

What is the current behavior?

Connection breaks after initial meta data is shared What's happening that seems wrong? transcription is not recieved.

Steps to reproduce

# Initialize Deepgram Client
from deepgram import (    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions)
# Set up client configuration
config = DeepgramClientOptions(
    verbose=logger.DEBUG,
    # options={"keepalive": "true"}
)

DEEPGRAM_API_KEY = os.getenv("DEEPGRAM_API_KEY")
deepgram = DeepgramClient(DEEPGRAM_API_KEY,config)
dg_connection = deepgram.listen.live.v("1")

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()

    try:

        def on_open(self, open, **kwargs):
            print(f"\n\n{open}\n\n")

        def on_message(self, result, **kwargs):
            print(result,flush=True)
            sentence = result.channel.alternatives[0].transcript
            if len(sentence) == 0:
                return
            print(f"speaker: {sentence}",flush=True)

        def on_metadata(self, metadata, **kwargs):
            print(f"\n\n{metadata}\n\n")

        def on_speech_started(self, speech_started, **kwargs):
            print(f"\n\n{speech_started}\n\n",flush=True)

        def on_utterance_end(self, utterance_end, **kwargs):
            print(f"\n\n{utterance_end}\n\n",flush=True)

        def on_close(self, close, **kwargs):
            print(f"\n\n{close}\n\n")

        def on_error(self, error, **kwargs):
            print(f"\n\nERROR:: {error} -- {kwargs}\n\n")

        def on_unhandled(self, unhandled, **kwargs):
            print(f"\n\nUNHANDLED:: {unhandled} -- {kwargs}\n\n")

        dg_connection.on(LiveTranscriptionEvents.Open, on_open)
        dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
        # dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
        # # dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
        # # dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
        # # dg_connection.on(LiveTranscriptionEvents.Close, on_close)
        dg_connection.on(LiveTranscriptionEvents.Error, on_error)
        # dg_connection.on(LiveTranscriptionEvents.Unhandled, on_unhandled)

        # # connect to websocket
        options = LiveOptions(model="nova-2", language="en-US")

        print("\n\nPress Enter to stop recording...\n\n")
        if dg_connection.start(options) is False:
            print("Failed to start connection")
            return

        while True:
            r = await websocket.receive_bytes()
            dg_connection.send(r)

        # Indicate that we've finished
        dg_connection.finish()

        print("Finished",flush=True)

    except Exception as e:
        print(f"Could not open socket: {e}")
        dg_connection.finish()

    finally:
        await websocket.close()

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("code:app", host="0.0.0.0", port=8000, reload=True)

For the client, I am using Streamlit with webrt

import logging
import queue
import numpy as np
import pydub
import streamlit as st
from streamlit_webrtc import WebRtcMode, webrtc_streamer
import librosa
import time

webrtc_ctx = webrtc_streamer(
    key="sendonly-audio",
    mode=WebRtcMode.SENDONLY,
    audio_receiver_size=2048*64,
    media_stream_constraints={"audio": True},
)
def audio_frame_to_np_buffer(frames, target_sample_rate=None, mono=False):
    arrays = []

    for frame in frames:
        num_channels = len(frame.layout.channels)
        np_frame = np.frombuffer(frame.planes[0].to_bytes(), dtype=np.int16).reshape(
            -1, num_channels
        )
        if mono:
            np_frame = np.mean(np_frame, axis=1).astype(np.int16)
        if target_sample_rate is not None and target_sample_rate != frame.sample_rate:
            np_frame = np_frame.astype(np.float32) / 32768
            np_frame = librosa.resample(
                np_frame.T, orig_sr=frame.sample_rate, target_sr=target_sample_rate
            )
            if mono:
                np_frame = np_frame[np.newaxis, :]
            np_frame = (np_frame * 1).astype(np.int16) #32768

        arrays.append(np_frame)

    combined_buffer = np.concatenate(arrays, axis=0)

    return combined_buffer

def send_audio():
    asyncio.run(send_audio_websocket())

async def send_audio_websocket():
    global webrtc_ctx
    uri = "ws://api_server:5000/ws"  # Change this to your WebSocket server URI
    async with websockets.connect(uri) as websocket:
        print(websocket)
        while True:
            if webrtc_ctx.audio_receiver:
                try:
                    audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=0.2)
                except queue.Empty:
                    logger.warning("Queue is empty. Abort.")
                    continue
                # print(audio_frames)
                # for audio_data in audio_frames:
                np_buffer = audio_frame_to_np_buffer(
                    audio_frames, target_sample_rate=16000, mono=True
                )
                # print(np_buffer.tobytes())
                await websocket.send(np_buffer.tobytes())

            else:
                break

Please tell us about your environment

Other information

pi_server  | MetadataResponse: {                                                                                                                                                       
api_server  |     "type": "Metadata",                                                                                                                                                   
api_server  |     "transaction_key": "deprecated",                                                                                                                                      
api_server  |     "request_id": "e303d28b-5cd0-42e9-852e-19a38ad6b45a",                                                                                                                 
api_server  |     "sha256": "eba58d208a46d11bd9aab561ad04cc43a941e7ba7a55f28da064040aab2bd210",                                                                                         
api_server  |     "created": "2024-04-08T19:25:28.765Z",                                                                                                                                
api_server  |     "duration": 0.0,                                                                                                                                                      
api_server  |     "channels": 0                                                                                                                                                         
api_server  | }                                                                                                                                                                         
api_server  | 2024-04-08 19:25:34,466:VERBOSE - MetadataResponse: {                                                                                                                     
api_server  |     "type": "Metadata",                                                                                                                                                   
api_server  |     "transaction_key": "deprecated",                                                                                                                                      
api_server  |     "request_id": "e303d28b-5cd0-42e9-852e-19a38ad6b45a",                                                                                                                 
api_server  |     "sha256": "eba58d208a46d11bd9aab561ad04cc43a941e7ba7a55f28da064040aab2bd210",                                                                                         
api_server  |     "created": "2024-04-08T19:25:28.765Z",                                                                                                                                
api_server  |     "duration": 0.0,                                                                                                                                                      
api_server  |     "channels": 0
api_server  | }                               
api_server  | _listening(1000) exiting gracefully
api_server  | 2024-04-08 19:25:34,470:NOTICE - _listening(1000) exiting gracefully
api_server  | LiveClient._listening LEAVE
api_server  | 2024-04-08 19:25:34,471:DEBUG - LiveClient._listening LEAVE
api_server  | send() exiting gracefully: 1000
api_server  | 2024-04-08 19:25:34,476:NOTICE - send() exiting gracefully: 1000
api_server  | LiveClient._keep_alive LEAVE
api_server  | 2024-04-08 19:25:34,476:DEBUG - LiveClient._keep_alive LEAVE
api_server  | send() exiting gracefully: 1000
api_server  | 2024-04-08 19:25:34,497:NOTICE - send() exiting gracefully: 1000
api_server  | LiveClient._keep_alive LEAVE
dvonthenen commented 5 months ago

Usually when the connection is terminated by the server, it's because either:

I would double check that to make sure that you are sending audio data and the encoding matches.

dvonthenen commented 5 months ago

We have been discussing this in Discord. If issue still exists, drop a line here and we can reopen it.