# Twilio Voice AI Integration for AI Phone Bots

Twilio is the leading cloud telephony solution for developers. The Media Streams API lets you receive live call audio via WebSocket and send back synthesized speech—the foundation for any AI voice bot.

### Integration Architecture

```
Caller → Twilio PSTN → TwiML → Media Streams WebSocket → Your AI Server
                                                              ↓
                                                       STT → LLM → TTS
                                                              ↓
                                        Synthesized Audio → Twilio → Caller
```

### TwiML webhook for incoming calls

```python
from fastapi import FastAPI, Request, Response
from twilio.twiml.voice_response import VoiceResponse, Start, Stream, Say
# ASGI application instance; the HTTP and WebSocket handlers in this file register on it.
app = FastAPI()
@app.post("/incoming-call")
async def handle_incoming_call(request: Request):
    """Answer an incoming call with TwiML that starts a media stream and greets the caller.

    Args:
        request: The incoming webhook request. It is not inspected here.

    Returns:
        A response with media type ``text/xml`` whose body is the serialized
        TwiML document.
    """
    response = VoiceResponse()

    # Start the Media Stream towards the WebSocket handler.
    # NOTE(review): <Start><Stream> appears to be the one-way (listen-only)
    # variant; if audio must be played back over the same WebSocket, confirm
    # against the Twilio docs whether <Connect><Stream> is required instead.
    start = Start()
    start.stream(
        url="wss://api.yourapp.com/stream",
        track="both_tracks",  # inbound and outbound audio
    )
    response.append(start)

    # Speak the greeting (the text itself is runtime data and stays as-is).
    response.say(
        "Здравствуйте! Я голосовой ассистент. Как могу помочь?",
        voice="alice",
        language="ru-RU",
    )

    # Pause for 30 seconds after the greeting; presumably this keeps the call
    # open while the stream handler works — confirm the intended duration.
    response.pause(length=30)

    return Response(content=str(response), media_type="text/xml")
```

### WebSocket Media Streams Handler

```python
import asyncio
import json
import base64
from fastapi import WebSocket
@app.websocket("/stream")
async def handle_stream(websocket: WebSocket):
    """Consume media-stream events from the WebSocket and process caller audio in chunks.

    Each text frame is parsed as JSON and dispatched on its ``event`` field:

    * ``start`` – remembers the call/stream identifiers and creates a session.
    * ``media`` – base64-decodes the payload into a buffer; once the buffer
      holds at least two seconds of audio it is passed to
      ``process_audio_chunk`` and the buffer is emptied.
    * ``stop``  – ends the loop.

    Any other event is ignored. Exceptions raised inside the loop are logged
    and end the handler.

    Args:
        websocket: The WebSocket connection carrying the media stream.
    """
    # 2 seconds of μ-law audio at 8 kHz, one byte per sample.
    chunk_bytes = 16000

    await websocket.accept()
    call_sid = None
    stream_sid = None
    # Created on "start". Initialised here so that a "media" frame arriving
    # first cannot hit an unbound local.
    session = None
    audio_buffer = bytearray()

    try:
        async for message in websocket.iter_text():
            data = json.loads(message)
            event = data.get("event")

            if event == "start":
                call_sid = data["start"]["callSid"]
                stream_sid = data["start"]["streamSid"]
                session = create_session(call_sid)

            elif event == "media":
                if session is None:
                    # No session yet, so there is nothing to attach audio to.
                    continue

                media = data["media"]
                # The stream is requested with track="both_tracks", so frames
                # for the outbound leg arrive too. Appending both legs to one
                # buffer would interleave two signals; keep only the caller.
                # Frames without a "track" field are treated as inbound.
                if media.get("track", "inbound") != "inbound":
                    continue

                # Twilio uses mulaw 8kHz.
                audio_buffer.extend(base64.b64decode(media["payload"]))

                if len(audio_buffer) >= chunk_bytes:
                    await process_audio_chunk(
                        bytes(audio_buffer), websocket, stream_sid, session
                    )
                    # bytes(...) above took a copy, so clearing in place is safe.
                    audio_buffer.clear()

            elif event == "stop":
                break

    except Exception as e:
        logger.error(f"Stream error: {e}")
async def send_audio_to_caller(websocket: WebSocket, stream_sid: str, audio_bytes: bytes):
    """Send synthesized audio back into the call.

    The audio is base64-encoded and sent as a JSON ``media`` event tagged with
    the stream identifier. No format conversion happens here.

    Args:
        websocket: The WebSocket connection to send the message on.
        stream_sid: Identifier of the stream the audio belongs to.
        audio_bytes: Raw audio to send; presumably already in the format the
            receiving side expects — verify against the caller.
    """
    payload_b64 = base64.b64encode(audio_bytes).decode()
    media_event = {
        "event": "media",
        "streamSid": stream_sid,
        "media": {"payload": payload_b64},
    }
    await websocket.send_json(media_event)
```

### Audio format conversion

Twilio uses μ-law (mulaw) 8kHz. Whisper uses PCM 16kHz:

```python
import audioop
def mulaw_to_pcm16k(mulaw_bytes: bytes) -> bytes:
    """Convert μ-law 8 kHz audio to 16-bit linear PCM at 16 kHz.

    The input is first decoded from μ-law to 16-bit linear samples, then
    resampled as a single channel from 8000 Hz to 16000 Hz.

    Args:
        mulaw_bytes: μ-law encoded audio, one byte per sample.

    Returns:
        The resampled 16-bit PCM audio as bytes.
    """
    # μ-law → 16-bit linear PCM (sample width 2 bytes), still at 8 kHz.
    linear_8k = audioop.ulaw2lin(mulaw_bytes, 2)
    # Mono, 8000 → 16000 Hz; no prior state. ratecv returns (fragment, state).
    resampled = audioop.ratecv(linear_8k, 2, 1, 8000, 16000, None)
    return resampled[0]







