Closed frmsaul closed 1 month ago
Yes, listen for the `session.created` event and respond by sending a `{"type": "response.create"}` event to OpenAI to prompt the model to respond. To greet the caller with a specific message, update the `SYSTEM_MESSAGE` with an example or instructions on how to respond at the top of the conversation.
That's very helpful. Thank you. I have a few more (maybe obvious / naive) questions.
Something like this?
// Send a `response.create` event so the model responds right away,
// with instructions supplied on this individual response.
openai_ws.send(JSON.stringify({
  type: "response.create",
  response: {
    modalities: ["text"],
    instructions: "Please assist the user"
  }
}));
Do you know if this will overwrite my existing system prompt? (It is defined when the socket opens, as shown below.)
# Session-level configuration, sent when the socket opens.
session_update = {
    "type": "session.update",
    "session": {
        "turn_detection": {"type": "server_vad"},
        "input_audio_format": "g711_ulaw",
        "output_audio_format": "g711_ulaw",
        "voice": 'alloy',
        # The existing system prompt the question above refers to.
        "instructions": prompt,
        "modalities": ["text", "audio"],
        "temperature": 0.8,
    }
}
await openai_ws.send(json.dumps(session_update))
Do you think I can do something like this?
// Send a `response.create` event whose instructions embed the greeting.
// A template literal is used for interpolation; the Python-style f"..."
// prefix is not valid JavaScript.
openai_ws.send(JSON.stringify({
  type: "response.create",
  response: {
    modalities: ["text"],
    instructions: `Please assist the user by saying: ${greeting}`
  }
}));
OK, I tried this solution. I might be misunderstanding something, but it seems to work at most 50% of the time. This is what my code looks like (it is also super flaky when I put it in the `session.created` handler):
import os
import json
import base64
import asyncio
import websockets
from fastapi import FastAPI, WebSocket, Request
from fastapi.responses import HTMLResponse
from fastapi.websockets import WebSocketDisconnect
from twilio.twiml.voice_response import VoiceResponse, Connect, Say, Stream
from dotenv import load_dotenv
from app.prompt import proParams
# Load variables from a local .env file into the process environment.
load_dotenv()

# Configuration
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') # requires OpenAI Realtime API Access
PORT = int(os.getenv('PORT', 8765))

# OpenAI event types that are printed in full when received.
LOG_EVENT_TYPES = [
    'response.content.done', 'rate_limits.updated', 'response.done',
    'input_audio_buffer.committed', 'input_audio_buffer.speech_stopped',
    'input_audio_buffer.speech_started', 'session.created'
]

app = FastAPI()

# Fail at import time rather than on the first call if the key is missing.
if not OPENAI_API_KEY:
    raise ValueError('Missing the OpenAI API key. Please set it in the .env file.')
@app.websocket("/media-stream/{params}")
async def handle_media_stream(websocket: WebSocket, params: str):
    """Handle WebSocket connections between Twilio and OpenAI.

    ``params`` is a base64-encoded JSON object. Its ``params`` and
    ``socratic_url`` entries are handed to ``proParams``, whose result is
    read below by the ``'prompt'`` and ``'greeting'`` keys.

    Two coroutines run concurrently for the lifetime of the call: one
    forwards caller audio from Twilio to OpenAI, the other forwards model
    audio from OpenAI back to Twilio.

    NOTE(review): the nesting below was reconstructed from a paste that had
    lost its indentation; confirm the placement of the greeting block
    against the original file.
    """
    parsed_params = json.loads(base64.b64decode(params).decode('utf-8'))
    body_params = parsed_params.get('params')
    socratic_url = parsed_params.get('socratic_url')
    agent_params = proParams(body_params, socratic_url)
    await websocket.accept()

    async with websockets.connect(
        'wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-10-01',
        extra_headers={
            "Authorization": f"Bearer {OPENAI_API_KEY}",
            "OpenAI-Beta": "realtime=v1"
        }
    ) as openai_ws:
        await send_session_update(openai_ws, agent_params)
        # Set by the Twilio 'start' event; needed to address outgoing audio.
        stream_sid = None

        async def receive_from_twilio():
            """Receive audio data from Twilio and send it to the OpenAI Realtime API."""
            nonlocal stream_sid
            try:
                async for message in websocket.iter_text():
                    data = json.loads(message)
                    if data['event'] == 'media' and openai_ws.open:
                        # Forward the caller's audio payload unchanged.
                        audio_append = {
                            "type": "input_audio_buffer.append",
                            "audio": data['media']['payload']
                        }
                        await openai_ws.send(json.dumps(audio_append))
                    elif data['event'] == 'start':
                        stream_sid = data['start']['streamSid']
                        print(f"Incoming stream has started {stream_sid}")
            except WebSocketDisconnect:
                print("Client disconnected.")
                # The caller hung up, so close the OpenAI side as well.
                if openai_ws.open:
                    await openai_ws.close()

        async def send_to_twilio():
            """Receive events from the OpenAI Realtime API, send audio back to Twilio."""
            nonlocal stream_sid
            try:
                async for openai_message in openai_ws:
                    response = json.loads(openai_message)
                    if response['type'] in LOG_EVENT_TYPES:
                        print(f"Received event: {response['type']}", response)
                    if response['type'] == 'session.created':
                        pass
                    if response['type'] == 'session.updated':
                        print("Session updated successfully:", response)
                        greeting = agent_params['greeting']
                        print(f"greeting: {greeting}")
                        # Ask the model to speak first, without waiting for
                        # the caller.
                        # NOTE(review): only "text" is requested here; the
                        # resolution reported later in this thread is to
                        # request ["text", "audio"] instead.
                        await openai_ws.send(json.dumps({
                            "type": "response.create",
                            "response": {
                                "modalities": ["text"],
                                "instructions": f"Please assist the user! Open with: '{greeting}'",
                            }
                        }))
                    if response['type'] == 'response.audio.delta' and response.get('delta'):
                        # Audio from OpenAI
                        try:
                            # Decode then re-encode the delta before sending
                            # it on (presumably a base64 validity check).
                            audio_payload = base64.b64encode(
                                base64.b64decode(response['delta'])
                            ).decode('utf-8')
                            audio_delta = {
                                "event": "media",
                                "streamSid": stream_sid,
                                "media": {
                                    "payload": audio_payload
                                }
                            }
                            await websocket.send_json(audio_delta)
                        except Exception as e:
                            print(f"Error processing audio data: {e}")
            except Exception as e:
                print(f"Error in send_to_twilio: {e}")

        await asyncio.gather(receive_from_twilio(), send_to_twilio())
async def send_session_update(openai_ws, agent_params):
    """Send session update to OpenAI WebSocket.

    Args:
        openai_ws: Open connection exposing an awaitable ``send(str)``.
        agent_params: Mapping whose ``'prompt'`` entry becomes the session
            instructions.

    Raises:
        KeyError: If ``agent_params`` has no ``'prompt'`` entry.
    """
    prompt = agent_params['prompt']
    session_update = {
        "type": "session.update",
        "session": {
            "turn_detection": {"type": "server_vad"},
            "input_audio_format": "g711_ulaw",
            "output_audio_format": "g711_ulaw",
            "voice": 'alloy',
            "instructions": prompt,
            "modalities": ["text", "audio"],
            "temperature": 0.8,
        }
    }
    await openai_ws.send(json.dumps(session_update))
# Start the server on all interfaces when the file is run as a script.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=PORT)
Try responding with the greeting on the `session.created` event. That works for me.
This definitely doesn't work for me. I wonder if it's because I call an API to get the prompt, which adds some delay and messes things up.
It's solved by doing:
// Request both text and audio so the greeting comes back as audio.
// The template literal replaces the Python-style f"..." prefix, which is
// not valid JavaScript.
openai_ws.send(JSON.stringify({
  type: "response.create",
  response: {
    modalities: ["text", "audio"],
    instructions: `Please assist the user by saying: ${greeting}`
  }
}));
Basically changing:
modalities: ["text"]
-> modalities: ["text", "audio"]
Closing the issue for now!
Thanks @jhmaddox
Is there a way to:
- React to the conversation starting and say something initially (without waiting for the user to say something)?
- Greet the caller somehow with a specific message (e.g., "Hello! You've reached an assistant")?