Realtime: twilio example

rm-openai · rm-openai · commit ecf4a0031990 · 2025-07-29T14:11:23.000-04:00
diff --git a/docs/scripts/generate_ref_files.py b/docs/scripts/generate_ref_files.py
@@ -31,6 +31,7 @@ def md_target(py_path: Path) -> Path:
     rel = py_path.relative_to(SRC_ROOT).with_suffix(".md")
     return DOCS_ROOT / rel
 
+
 def pretty_title(last_segment: str) -> str:
     """
     Convert a module/file segment like 'tool_context' to 'Tool Context'.
@@ -39,6 +40,7 @@ def pretty_title(last_segment: str) -> str:
     cleaned = last_segment.replace("_", " ").replace("-", " ")
     return capwords(cleaned)
 
+
 # ---- Main ------------------------------------------------------------
 
 
diff --git a/examples/realtime/twilio/README.md b/examples/realtime/twilio/README.md
@@ -0,0 +1,86 @@
+# Realtime Twilio Integration
+
+This example demonstrates how to connect the OpenAI Realtime API to a phone call using Twilio's Media Streams. The server handles incoming phone calls and streams audio between Twilio and the OpenAI Realtime API, enabling real-time voice conversations with an AI agent over the phone.
+
+## Prerequisites
+
+-   Python 3.9+
+-   OpenAI API key with Realtime API access
+-   Twilio account with a phone number
+-   A tunneling service like ngrok to expose your local server
+
+## Setup
+
+1. **Start the server:**
+
+    ```bash
+    uv run server.py
+    ```
+
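+    The server will start on port 8000 by default (override it with the `PORT` environment variable). Make sure `OPENAI_API_KEY` is set in your environment before starting.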
+
+2. **Expose the server publicly, e.g. via ngrok:**
+
+    ```bash
+    ngrok http 8000
+    ```
+
+    Note the public URL (e.g., `https://abc123.ngrok.io`); you will need it for the webhook in the next step.
+
+3. **Configure your Twilio phone number:**
+    - Log into your Twilio Console
+    - Select your phone number
+    - Set the webhook URL for incoming calls to: `https://your-ngrok-url.ngrok.io/incoming-call`
+    - Set the HTTP method to POST
+
+## Usage
+
+1. Call your Twilio phone number
+2. You'll hear: "Hello! You're now connected to an AI assistant. You can start talking!"
+3. Start speaking - the AI will respond in real-time
+4. The assistant has access to tools like weather information and current time
+
+## How It Works
+
+1. **Incoming Call**: When someone calls your Twilio number, Twilio makes a request to `/incoming-call`
+2. **TwiML Response**: The server returns TwiML that:
+    - Plays a greeting message
+    - Connects the call to a WebSocket stream at `/media-stream`
+3. **WebSocket Connection**: Twilio establishes a WebSocket connection for bidirectional audio streaming
+4. **Transport Layer**: The `TwilioRealtimeTransportLayer` class owns the WebSocket message handling:
+    - Takes ownership of the Twilio WebSocket after initial handshake
+    - Runs its own message loop to process all Twilio messages
+    - Handles protocol differences between Twilio and OpenAI
+    - Automatically sets G.711 μ-law audio format for Twilio compatibility
+    - Manages audio chunk tracking for interruption support
+    - Wraps the OpenAI realtime model instead of subclassing it
+5. **Audio Processing**:
+    - Audio from the caller is base64-decoded and sent to the OpenAI Realtime API
+    - Audio responses from OpenAI are base64-encoded and sent back to Twilio
+    - Twilio plays the audio to the caller
+
+## Configuration
+
+- **Port**: Set `PORT` environment variable (default: 8000)
+- **OpenAI API Key**: Set `OPENAI_API_KEY` environment variable
+- **Agent Instructions**: Modify the `RealtimeAgent` configuration in `twilio_handler.py` (`server.py` only defines the HTTP and WebSocket routes)
+- **Tools**: Add or modify function tools in `twilio_handler.py`
+
+## Troubleshooting
+
+- **WebSocket connection issues**: Ensure your ngrok URL is correct and publicly accessible
+- **Audio quality**: Twilio streams audio as G.711 μ-law (mulaw) at 8 kHz, so expect telephone-grade audio quality
+- **Latency**: Network latency between Twilio, your server, and OpenAI affects response time
+- **Logs**: Check the console output for detailed connection and error logs
+
+## Architecture
+
+```
+Phone Call → Twilio → WebSocket → TwilioRealtimeTransportLayer → OpenAI Realtime API
+                                              ↓
+                                      RealtimeAgent with Tools
+                                              ↓
+                           Audio Response → Twilio → Phone Call
+```
+
+The `TwilioRealtimeTransportLayer` acts as a bridge between Twilio's Media Streams and OpenAI's Realtime API, handling the protocol differences and audio format conversions. It wraps the OpenAI realtime model to provide a clean interface for Twilio integration.
diff --git a/examples/realtime/twilio/__init__.py b/examples/realtime/twilio/__init__.py
diff --git a/examples/realtime/twilio/requirements.txt b/examples/realtime/twilio/requirements.txt
@@ -0,0 +1,4 @@
+fastapi
+uvicorn[standard]
+websockets
+python-dotenv
diff --git a/examples/realtime/twilio/server.py b/examples/realtime/twilio/server.py
@@ -0,0 +1,80 @@
+import os
+from typing import TYPE_CHECKING
+
+from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
+from fastapi.responses import PlainTextResponse
+
+# Import TwilioHandler class - handle both module and package use cases
+if TYPE_CHECKING:
+    # For type checking, use the relative import
+    from .twilio_handler import TwilioHandler
+else:
+    # At runtime, try both import styles
+    try:
+        # Try relative import first (when used as a package)
+        from .twilio_handler import TwilioHandler
+    except ImportError:
+        # Fall back to direct import (when run as a script)
+        from twilio_handler import TwilioHandler
+
+
+class TwilioWebSocketManager:  # builds one TwilioHandler per incoming Twilio WebSocket
+    def __init__(self):
+        self.active_handlers: dict[str, TwilioHandler] = {}  # not written to anywhere in this file
+
+    async def new_session(self, websocket: WebSocket) -> TwilioHandler:
+        """Create and configure a new session."""
+        print("Creating twilio handler")
+        # The handler is only constructed here; the caller starts it and waits on it.
+        handler = TwilioHandler(websocket)
+        return handler
+
+    # In a real app, you'd also track the handler and clean up/close it when the call ends
+
+
+manager = TwilioWebSocketManager()
+app = FastAPI()
+
+
+@app.get("/")
+async def root():  # liveness check: returns a static JSON status message
+    return {"message": "Twilio Media Stream Server is running!"}
+
+
+@app.post("/incoming-call")
+@app.get("/incoming-call")  # the same handler answers both GET and POST webhooks
+async def incoming_call(request: Request):
+    """Handle incoming Twilio phone calls"""
+    host = request.headers.get("Host")
+    # NOTE(review): host is None when the Host header is absent -> "wss://None/media-stream".
+    twiml_response = f"""<?xml version="1.0" encoding="UTF-8"?>
+<Response>
+    <Say>Hello! You're now connected to an AI assistant. You can start talking!</Say>
+    <Connect>
+        <Stream url="wss://{host}/media-stream" />
+    </Connect>
+</Response>"""
+    return PlainTextResponse(content=twiml_response, media_type="text/xml")  # served as XML
+
+
+@app.websocket("/media-stream")
+async def media_stream_endpoint(websocket: WebSocket):
+    """WebSocket endpoint for Twilio Media Streams"""
+    # The socket is handed to a new TwilioHandler; this coroutine only starts it and waits.
+    try:
+        handler = await manager.new_session(websocket)
+        await handler.start()
+        # Presumably returns once the call/stream has finished -- confirm in TwilioHandler.
+        await handler.wait_until_done()
+    # Errors are printed and swallowed here; nothing is re-raised to the ASGI server.
+    except WebSocketDisconnect:
+        print("WebSocket disconnected")
+    except Exception as e:
+        print(f"WebSocket error: {e}")
+
+
+if __name__ == "__main__":
+    import uvicorn
+    # PORT overrides the default of 8000; host 0.0.0.0 listens on all interfaces.
+    port = int(os.getenv("PORT", 8000))
+    uvicorn.run(app, host="0.0.0.0", port=port)
diff --git a/examples/realtime/twilio/twilio_handler.py b/examples/realtime/twilio/twilio_handler.py