# OpenAI Realtime API Integration for Voice AI

OpenAI Realtime API (October 2024) — a WebSocket API for creating voice assistants with 200–500 ms latency. It eliminates the need to build a separate STT+LLM+TTS pipeline — everything happens over a single connection. Supports user interruption.

### Key Features

- Native voice-to-voice without intermediate transcriptions
- Server VAD: automatic detection of speech start/end
- Interruption handling: the user can interrupt a response
- Function calling in voice mode
- Support for text and audio simultaneously

### WebSocket Integration

```python
import asyncio
import json
import websockets
import base64
async def voice_assistant():
    """Run a full-duplex voice session against the OpenAI Realtime API.

    A single WebSocket replaces a separate STT+LLM+TTS pipeline: microphone
    audio is streamed up as base64 PCM16, and the model's audio/text events
    stream back over the same connection.

    Relies on names defined elsewhere in the file: ``OPENAI_API_KEY`` and
    ``get_microphone_stream()``.

    Raises:
        websockets.exceptions.WebSocketException: on connection failure.
    """
    url = "wss://api.openai.com/v1/realtime?model=gpt-4o-realtime-preview-2024-12-17"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "OpenAI-Beta": "realtime=v1",  # required beta header for the Realtime API
    }

    async with websockets.connect(url, extra_headers=headers) as ws:
        # Initialize the session: voice, PCM16 in/out, server-side VAD.
        await ws.send(json.dumps({
            "type": "session.update",
            "session": {
                "modalities": ["text", "audio"],
                "instructions": "Ты полезный голосовой ассистент. Отвечай по-русски, кратко.",
                "voice": "alloy",  # alloy | echo | fable | onyx | nova | shimmer
                "input_audio_format": "pcm16",
                "output_audio_format": "pcm16",
                "input_audio_transcription": {"model": "whisper-1"},
                "turn_detection": {
                    "type": "server_vad",
                    "threshold": 0.5,
                    "prefix_padding_ms": 300,
                    "silence_duration_ms": 700
                }
            }
        }))

        async def send_audio(audio_stream):
            # Forward raw audio chunks as base64-encoded append events.
            async for chunk in audio_stream:
                encoded = base64.b64encode(chunk).decode()
                await ws.send(json.dumps({
                    "type": "input_audio_buffer.append",
                    "audio": encoded
                }))

        async def receive_responses():
            audio_buffer = bytearray()
            async for message in ws:
                event = json.loads(message)
                # .get() tolerates events without a "type" key instead of
                # crashing the receive loop with a KeyError.
                event_type = event.get("type")
                if event_type == "response.audio.delta":
                    audio_data = base64.b64decode(event["delta"])
                    audio_buffer.extend(audio_data)
                    # Play chunks as they arrive
                elif event_type == "response.audio.done":
                    # Full audio track received; clear the buffer so memory
                    # does not grow without bound across turns.
                    audio_buffer.clear()
                elif event_type == "conversation.item.input_audio_transcription.completed":
                    print(f"User: {event['transcript']}")

        await asyncio.gather(send_audio(get_microphone_stream()),
                             receive_responses())
```

### Function calling in voice mode

```python
tools = [{
"type": "function",
"name": "get_order_status",
"description": "Получить статус заказа по номеру",
"parameters": {
"type": "object",
"properties": {
"order_id": {"type": "string", "description": "Номер заказа"}
},
"required": ["order_id"]
}
}]
await ws.send(json.dumps({ "type": "session.update", "session": {"tools": tools, "tool_choice": "auto"} }))







