deepgram / deepgram-python-sdk

Official Python SDK for Deepgram's automated speech recognition APIs.
https://developers.deepgram.com
MIT License
239 stars 63 forks source link

Python Websocket live transcription example not working #465

Closed sourav-bz closed 1 month ago

sourav-bz commented 1 month ago

Bug Report: Deepgram WebSocket Connection Issue

Current Behavior

When running the script stt-dg.py, the following output is observed:

python stt-dg.py
WebSocket server started on ws://localhost:8765
New client connected
/Users/sourav/Desktop/daily-bot-play/python/stt-dg.py:28: DeprecatedWarning: live is deprecated as of 3.4.0 and will be removed in 4.0.0. deepgram.listen.live is deprecated. Use deepgram.listen.websocket instead.
  dg_connection = deepgram.listen.live.v("1")
WebSocketException in ListenWebSocketClient.start: handle_websocket.<locals>.on_open() takes 0 positional arguments but 2 were given
Failed to connect to Deepgram

Issue Description

The script is not connecting and listening to the WebSocket for live transcripts as expected.

Steps to Reproduce

The following code is being used:

import asyncio
import websockets
import json
from dotenv import load_dotenv
import os
import logging

from deepgram import (
    DeepgramClient,
    DeepgramClientOptions,
    LiveTranscriptionEvents,
    LiveOptions,
    Microphone,
)

# Load variables from a local .env file into the environment so that
# DEEPGRAM_API_KEY can be read below.
load_dotenv()

# Transcripts of results flagged is_final are collected here and joined into a
# single utterance once speech_final or an UtteranceEnd event arrives.
# NOTE(review): this list lives at module level and the handlers rebind it via
# `global`, so it is not scoped to a single client connection.
is_finals = []

# Create the Deepgram client from the DEEPGRAM_API_KEY environment variable
# (os.getenv yields None when the variable is not set).
deepgram = DeepgramClient(os.getenv("DEEPGRAM_API_KEY"))

async def handle_websocket(websocket, path):
    """Bridge one WebSocket client to a Deepgram live-transcription connection.

    Registers event callbacks on the Deepgram connection, starts it, and then
    forwards every binary message received from the client to Deepgram.
    Transcripts are sent back to the client as JSON text of the form
    {"type": ..., "transcript": ...}, where type is one of "interim",
    "is_final", "speech_final" or "utterance_end".

    Args:
        websocket: Client connection; iterated for incoming messages and used
            to send results back.
        path: Not used in this function.

    NOTE(review): as posted, this function does not get past start(); the run
    log in this report ends with "Failed to connect to Deepgram".
    """
    print("New client connected")

    # Set up Deepgram live transcription
    # NOTE(review): the run log flags `listen.live` as deprecated as of 3.4.0
    # and names `listen.websocket` as the replacement.
    dg_connection = deepgram.listen.live.v("1")

    # NOTE(review): this is the handler named in the reported error: it is
    # called with 2 positional arguments but declares none. The maintainer
    # reply also asks for it to be declared `async def`.
    def on_open():
        print("Connection Open")

    # NOTE(review): the handlers below each declare one parameter; presumably
    # they are invoked the same way as on_open and would be rejected as well --
    # confirm against the SDK's expected handler signature.
    async def on_message(result):
        """Send a transcript result back to the client, tagged by finality."""
        global is_finals
        # Only the first alternative of the result is used.
        sentence = result.channel.alternatives[0].transcript
        # Results with an empty transcript are ignored.
        if len(sentence) == 0:
            return
        if result.is_final:
            print(f"Message: {result.to_json()}")
            is_finals.append(sentence)

            if result.speech_final:
                # End of speech: emit everything buffered so far as one
                # utterance and reset the buffer.
                utterance = " ".join(is_finals)
                print(f"Speech Final: {utterance}")
                await websocket.send(json.dumps({"type": "speech_final", "transcript": utterance}))
                is_finals = []
            else:
                # Final for this segment but the speaker has not finished:
                # the segment stays buffered and is also sent on its own.
                print(f"Is Final: {sentence}")
                await websocket.send(json.dumps({"type": "is_final", "transcript": sentence}))
        else:
            # Non-final results are forwarded but never buffered.
            print(f"Interim Results: {sentence}")
            await websocket.send(json.dumps({"type": "interim", "transcript": sentence}))

    def on_metadata(metadata):
        print(f"Metadata: {metadata}")

    def on_speech_started(speech_started):
        print("Speech Started")

    async def on_utterance_end(utterance_end):
        """Flush any buffered final transcripts as an "utterance_end" message."""
        print("Utterance End")
        global is_finals
        # Nothing is sent when the buffer is already empty (for example after
        # a speech_final result reset it).
        if len(is_finals) > 0:
            utterance = " ".join(is_finals)
            print(f"Utterance End: {utterance}")
            await websocket.send(json.dumps({"type": "utterance_end", "transcript": utterance}))
            is_finals = []

    def on_close(close):
        print("Connection Closed")

    def on_error(error):
        print(f"Handled Error: {error}")

    def on_unhandled(unhandled):
        print(f"Unhandled Websocket Message: {unhandled}")

    # Attach one handler per Deepgram live-transcription event.
    # NOTE(review): plain and async handlers are mixed here; the maintainer
    # reply states the plain ones are missing `async`.
    dg_connection.on(LiveTranscriptionEvents.Open, on_open)
    dg_connection.on(LiveTranscriptionEvents.Transcript, on_message)
    dg_connection.on(LiveTranscriptionEvents.Metadata, on_metadata)
    dg_connection.on(LiveTranscriptionEvents.SpeechStarted, on_speech_started)
    dg_connection.on(LiveTranscriptionEvents.UtteranceEnd, on_utterance_end)
    dg_connection.on(LiveTranscriptionEvents.Close, on_close)
    dg_connection.on(LiveTranscriptionEvents.Error, on_error)
    dg_connection.on(LiveTranscriptionEvents.Unhandled, on_unhandled)

    # Transcription settings passed to start(): 16 kHz, single-channel
    # linear16 audio, with interim results and VAD events enabled.
    options = LiveOptions(
        model="nova-2",
        language="en-US",
        smart_format=True,
        encoding="linear16",
        channels=1,
        sample_rate=16000,
        interim_results=True,
        utterance_end_ms="1000",
        vad_events=True,
        endpointing=300,
    )

    # Extra parameters handed to start() alongside the options.
    addons = {
        "no_delay": "true"
    }

    # NOTE(review): start() is called without `await` here, while send() below
    # is awaited; the maintainer reply says this call is missing `await`.
    # This early return happens before the try block, so finish() is not
    # called on this path.
    if dg_connection.start(options, addons=addons) is False:
        print("Failed to connect to Deepgram")
        return

    try:
        async for message in websocket:
            # Only binary messages are forwarded to Deepgram; anything else is
            # reported and dropped.
            if isinstance(message, bytes):
                await dg_connection.send(message)
            else:
                print(f"Received unexpected message type: {type(message)}")
    except websockets.exceptions.ConnectionClosedError:
        print("Client disconnected")
    except Exception as e:
        # Any other failure is reported here rather than propagated.
        print(f"Error in WebSocket communication: {e}")
    finally:
        # Runs whether the receive loop ended normally or through an exception.
        # NOTE(review): finish() is not awaited; presumably it needs `await`
        # like start() if the connection is the async client -- confirm.
        dg_connection.finish()

async def main():
    """Start the WebSocket server on localhost:8765 and wait until it closes."""
    # handle_websocket is registered as the handler for the server's connections.
    server = await websockets.serve(handle_websocket, "localhost", 8765)
    print("WebSocket server started on ws://localhost:8765")
    await server.wait_closed()

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

Environment

davidvonthenen commented 1 month ago

Hi @sourav-bz

Please take a look at the Async example: https://github.com/deepgram/deepgram-python-sdk/blob/main/examples/speech-to-text/websocket/async_http/main.py

You are missing a number of `async` declarations in your code. For example, `def on_open():` should be `async def on_open():`.

The call to `start()` is also incorrect because it is missing `await`: `if dg_connection.start(options, addons=addons) is False:` should be `if await dg_connection.start(options) is False:`.

You have a number of errors like that, which will cause calls to those functions to hang.

davidvonthenen commented 1 month ago

If you are still having problems, drop by Discord and I would be happy to discuss!