twilio-samples / speech-assistant-openai-realtime-api-python


First voice message #9

Closed frmsaul closed 1 month ago

frmsaul commented 1 month ago

Is there a way to:

  1. React to the conversation starting and say something initially (without waiting for the user to say something)

  2. Greet the caller somehow with a specific message (e.g. "Hello! You've reached an assistant")

jhmaddox commented 1 month ago

Yes, listen for the session.created event and send a {"type": "response.create"} event to OpenAI to prompt the model to respond. To greet the caller with a specific message, update the SYSTEM_MESSAGE with an example or instructions on how to respond at the top of the conversation.
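
A minimal sketch of that pattern, assuming openai_ws is the already-open Realtime websocket from this sample (the greeting text is illustrative, not the repo's SYSTEM_MESSAGE):

    # sketch: inside the loop that reads events from openai_ws
    async for openai_message in openai_ws:
        event = json.loads(openai_message)
        if event['type'] == 'session.created':
            # ask the model to speak first instead of waiting for the caller
            await openai_ws.send(json.dumps({
                "type": "response.create",
                "response": {
                    "instructions": "Greet the caller with: 'Hello! You've reached an assistant.'"
                }
            }))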

frmsaul commented 1 month ago

That's very helpful. Thank you. I have a few more (maybe obvious / naive) questions.

Something like this?

    await openai_ws.send(json.dumps({
        "type": "response.create",
        "response": {
            "modalities": ["text"],
            "instructions": "Please assist the user"
        }
    }))
  1. Do you know if this will overwrite my existing system prompt? (defined when the socket opens, like below)

    session_update = {
        "type": "session.update",
        "session": {
            "turn_detection": {"type": "server_vad"},
            "input_audio_format": "g711_ulaw",
            "output_audio_format": "g711_ulaw",
            "voice": 'alloy',
            "instructions": prompt,
            "modalities": ["text", "audio"],
            "temperature": 0.8,
        }
    }
    await openai_ws.send(json.dumps(session_update))
  2. Do you think I can do something like this:

    await openai_ws.send(json.dumps({
        "type": "response.create",
        "response": {
            "modalities": ["text"],
            "instructions": f"Please assist the user by saying: {greeting}"
        }
    }))
frmsaul commented 1 month ago

OK, I tried this solution. I might be misunderstanding something, but it seems to work at most 50% of the time. This is what my code looks like (it's also super flaky when I trigger it on session.created):

import os
import json
import base64
import asyncio
import websockets
from fastapi import FastAPI, WebSocket, Request
from fastapi.responses import HTMLResponse
from fastapi.websockets import WebSocketDisconnect
from twilio.twiml.voice_response import VoiceResponse, Connect, Say, Stream
from dotenv import load_dotenv

from app.prompt import proParams

load_dotenv()

# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') # requires OpenAI Realtime API Access
PORT = int(os.getenv('PORT', 8765))

LOG_EVENT_TYPES = [
    'response.content.done', 'rate_limits.updated', 'response.done',
    'input_audio_buffer.committed', 'input_audio_buffer.speech_stopped',
    'input_audio_buffer.speech_started', 'session.created'
]

app = FastAPI()

if not OPENAI_API_KEY:
    raise ValueError('Missing the OpenAI API key. Please set it in the .env file.')

@app.websocket("/media-stream/{params}")
async def handle_media_stream(websocket: WebSocket, params: str):
    """Handle WebSocket connections between Twilio and OpenAI."""
    parsed_params = json.loads(base64.b64decode(params).decode('utf-8'))

    body_params = parsed_params.get('params')
    socratic_url = parsed_params.get('socratic_url')
    agent_params = proParams(body_params, socratic_url)

    await websocket.accept()

    async with websockets.connect(
        'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01',
        extra_headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1"
        }
    ) as openai_ws:
        await send_session_update(openai_ws, agent_params)
        stream_sid = None

        async def receive_from_twilio():
            """Receive audio data from Twilio and send it to the OpenAI Realtime API."""
            nonlocal stream_sid
            try:
                async for message in websocket.iter_text():
                    data = json.loads(message)
                    if data['event'] == 'media' and openai_ws.open:
                        audio_append = {
                            "type": "input_audio_buffer.append",
                            "audio": data['media']['payload']
                        }
                        await openai_ws.send(json.dumps(audio_append))
                    elif data['event'] == 'start':
                        stream_sid = data['start']['streamSid']
                        print(f"Incoming stream has started {stream_sid}")
            except WebSocketDisconnect:
                print("Client disconnected.")
                if openai_ws.open:
                    await openai_ws.close()

        async def send_to_twilio():
            """Receive events from the OpenAI Realtime API, send audio back to Twilio."""
            nonlocal stream_sid
            try:
                async for openai_message in openai_ws:
                    response = json.loads(openai_message)
                    if response['type'] in LOG_EVENT_TYPES:
                        print(f"Received event: {response['type']}", response)

                    if response['type'] == 'session.created':
                        pass
                    if response['type'] == 'session.updated':
                        print("Session updated successfully:", response)
                        greeting = agent_params['greeting']
                        print(f"greeting: {greeting}")
                        await openai_ws.send(json.dumps({
                            "type": "response.create",
                            "response": {
                                "modalities": ["text"],
                                "instructions": f"Please assist the user! Open with: '{greeting}'",
                            }
                        }))
                    if response['type'] == 'response.audio.delta' and response.get('delta'):
                        # Audio from OpenAI
                        try:
                            audio_payload = base64.b64encode(
                                base64.b64decode(response['delta'])
                            ).decode('utf-8')
                            audio_delta = {
                                "event": "media",
                                "streamSid": stream_sid,
                                "media": {
                                    "payload": audio_payload
                                }
                            }
                            await websocket.send_json(audio_delta)
                        except Exception as e:
                            print(f"Error processing audio data: {e}")
            except Exception as e:
                print(f"Error in send_to_twilio: {e}")

        await asyncio.gather(receive_from_twilio(), send_to_twilio())

async def send_session_update(openai_ws, agent_params):
    """Send session update to OpenAI WebSocket."""
    prompt = agent_params['prompt']

    session_update = {
        "type": "session.update",
        "session": {
            "turn_detection": {"type": "server_vad"},
            "input_audio_format": "g711_ulaw",
            "output_audio_format": "g711_ulaw",
            "voice": 'alloy',
            "instructions": prompt,
            "modalities": ["text", "audio"],
            "temperature": 0.8,
        }
    }
    await openai_ws.send(json.dumps(session_update))

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=PORT)
jhmaddox commented 1 month ago

Try responding with the greeting on the session.created event. That works for me.
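
In the handler above, that would mean replacing the pass under session.created with the greeting request, roughly like this sketch (modalities omitted here, so the session defaults apply):

    # sketch: send the greeting as soon as the session exists
    if response['type'] == 'session.created':
        greeting = agent_params['greeting']
        await openai_ws.send(json.dumps({
            "type": "response.create",
            "response": {
                "instructions": f"Please assist the user! Open with: '{greeting}'"
            }
        }))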

frmsaul commented 1 month ago

This definitely doesn't work for me. I wonder if it's because I call an API to get the prompt, which adds some delay and messes things up.

frmsaul commented 1 month ago

It's solved by doing:

    await openai_ws.send(json.dumps({
        "type": "response.create",
        "response": {
            "modalities": ["text", "audio"],
            "instructions": f"Please assist the user by saying: {greeting}"
        }
    }))

Basically, changing "modalities": ["text"] to "modalities": ["text", "audio"].
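
For reference, wired into the session.updated handler from the code above, the working version looks roughly like this (a sketch; greeting still comes from agent_params):

    if response['type'] == 'session.updated':
        greeting = agent_params['greeting']
        await openai_ws.send(json.dumps({
            "type": "response.create",
            "response": {
                # requesting audio is what makes the greeting actually play over the call
                "modalities": ["text", "audio"],
                "instructions": f"Please assist the user by saying: {greeting}"
            }
        }))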

Closing the issue for now!

Thanks @jhmaddox