Merge remote-tracking branch 'origin/main' into litellm_/compassionate-shannon

yuneng-berri · yuneng-berri · commit c7f610c57e03 · 2026-04-09T21:15:03.000-07:00
diff --git a/docs/my-website/docs/proxy/config_settings.md b/docs/my-website/docs/proxy/config_settings.md
@@ -602,6 +602,8 @@ router_settings:
 | MCP_OAUTH2_TOKEN_CACHE_MAX_SIZE | Maximum number of entries in MCP OAuth2 token cache. Default is 200
 | MCP_OAUTH2_TOKEN_CACHE_MIN_TTL | Minimum TTL in seconds for MCP OAuth2 token cache. Default is 10
 | MCP_OAUTH2_TOKEN_EXPIRY_BUFFER_SECONDS | Seconds to subtract from token expiry when computing cache TTL. Default is 60
+| MCP_PER_USER_TOKEN_DEFAULT_TTL | Default TTL in seconds for per-user MCP OAuth tokens stored in Redis. Default is 43200 (12 hours)
+| MCP_PER_USER_TOKEN_EXPIRY_BUFFER_SECONDS | Seconds to subtract from per-user MCP OAuth token expiry when computing Redis TTL. Default is 60
 | DEFAULT_MOCK_RESPONSE_COMPLETION_TOKEN_COUNT | Default token count for mock response completions. Default is 20
 | DEFAULT_MOCK_RESPONSE_PROMPT_TOKEN_COUNT | Default token count for mock response prompts. Default is 10
 | DEFAULT_MODEL_CREATED_AT_TIME | Default creation timestamp for models. Default is 1677610602
diff --git a/litellm/llms/custom_httpx/llm_http_handler.py b/litellm/llms/custom_httpx/llm_http_handler.py
@@ -1,5 +1,6 @@
 import json
 import ssl
+from urllib.parse import parse_qs, urlencode, urlparse, urlunparse
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -5027,6 +5028,16 @@ async def async_responses_websocket(
             litellm_params={},
         )
         ws_url = http_url.replace("https://", "wss://").replace("http://", "ws://")
+        # OpenAI's WebSocket responses endpoint requires ?model= in the URL,
+        # matching the Realtime API convention (wss://.../v1/realtime?model=...).
+        # Use urllib.parse so existing query params (e.g. api-version) are preserved.
+        _parsed = urlparse(ws_url)
+        _qs = parse_qs(_parsed.query)
+        if "model" not in _qs:
+            _qs["model"] = [model]
+            ws_url = urlunparse(
+                _parsed._replace(query=urlencode({k: v[0] for k, v in _qs.items()}))
+            )
 
         try:
             ssl_context = get_shared_realtime_ssl_context()
diff --git a/tests/test_litellm/responses/test_responses_websocket_all_providers.py b/tests/test_litellm/responses/test_responses_websocket_all_providers.py
@@ -971,3 +971,106 @@ def test_extract_output_messages_with_mixed_text_types(self):
         )
         assert len(messages) == 1
         assert messages[0]["content"][0]["text"] == "Part 1Part 2"
+
+
+class TestNativeWebSocketUrlConstruction:
+    """Test that native WebSocket URLs include the model query parameter.
+
+    These tests mock websockets.connect so they exercise the actual URL-building
+    code inside BaseLLMHTTPHandler.async_responses_websocket rather than
+    reimplementing the logic themselves.
+    """
+
+    @pytest.mark.asyncio
+    async def test_openai_ws_url_includes_model(self):
+        """Handler must pass ?model= in the URL to the backend WebSocket."""
+        from unittest.mock import AsyncMock, MagicMock, patch
+
+        captured_urls = []  # URLs passed to the patched websockets.connect
+
+        class FakeConnect:
+            def __init__(self, url, **kwargs):
+                captured_urls.append(url)  # record the URL, ignore all other connect options
+
+            async def __aenter__(self):
+                raise Exception("stop")  # fail on enter; NOTE(review): test assumes the handler does not re-raise this - confirm
+
+            async def __aexit__(self, *args):
+                pass
+
+        mock_config = MagicMock(spec=OpenAIResponsesAPIConfig)
+        mock_config.supports_native_websocket.return_value = True
+        mock_config.get_complete_url.return_value = "https://api.openai.com/v1/responses"  # no query string yet
+        mock_config.validate_environment.return_value = {}
+
+        mock_logging = MagicMock()
+        mock_logging.pre_call = MagicMock()
+
+        from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
+
+        handler = BaseLLMHTTPHandler()
+
+        mock_ws = MagicMock()
+        mock_ws.close = AsyncMock()  # awaitable close(); presumably awaited by the handler on failure - confirm
+
+        with patch("websockets.connect", FakeConnect):  # swap the real connect for the URL-capturing stub
+            await handler.async_responses_websocket(
+                model="gpt-4o-mini",
+                websocket=mock_ws,
+                logging_obj=mock_logging,
+                responses_api_provider_config=mock_config,
+                api_key="sk-test",
+            )
+
+        assert len(captured_urls) == 1  # exactly one connection attempt was made
+        from urllib.parse import parse_qs, urlparse
+        qs = parse_qs(urlparse(captured_urls[0]).query)  # parse_qs maps each key to a list of values
+        assert qs.get("model") == ["gpt-4o-mini"], f"Expected model in URL, got: {captured_urls[0]}"
+
+    @pytest.mark.asyncio
+    async def test_ws_url_preserves_existing_params_and_adds_model(self):
+        """When api_base already has query params, model is added alongside them."""
+        from unittest.mock import AsyncMock, MagicMock, patch
+
+        captured_urls = []  # URLs passed to the patched websockets.connect
+
+        class FakeConnect:
+            def __init__(self, url, **kwargs):
+                captured_urls.append(url)  # record the URL, ignore all other connect options
+
+            async def __aenter__(self):
+                raise Exception("stop")  # fail on enter; NOTE(review): test assumes the handler does not re-raise this - confirm
+
+            async def __aexit__(self, *args):
+                pass
+
+        mock_config = MagicMock(spec=OpenAIResponsesAPIConfig)
+        mock_config.supports_native_websocket.return_value = True
+        mock_config.get_complete_url.return_value = (
+            "https://custom.example.com/v1/responses?api-version=2024-05-01"  # URL already carries a query parameter
+        )
+        mock_config.validate_environment.return_value = {}
+
+        mock_logging = MagicMock()
+        mock_logging.pre_call = MagicMock()
+
+        from litellm.llms.custom_httpx.llm_http_handler import BaseLLMHTTPHandler
+
+        handler = BaseLLMHTTPHandler()
+        mock_ws = MagicMock()
+        mock_ws.close = AsyncMock()  # awaitable close(); presumably awaited by the handler on failure - confirm
+
+        with patch("websockets.connect", FakeConnect):  # swap the real connect for the URL-capturing stub
+            await handler.async_responses_websocket(
+                model="gpt-4o",
+                websocket=mock_ws,
+                logging_obj=mock_logging,
+                responses_api_provider_config=mock_config,
+                api_key="sk-test",
+            )
+
+        assert len(captured_urls) == 1  # exactly one connection attempt was made
+        from urllib.parse import parse_qs, urlparse
+        qs = parse_qs(urlparse(captured_urls[0]).query)  # parse_qs maps each key to a list of values
+        assert qs.get("model") == ["gpt-4o"], f"model missing from URL: {captured_urls[0]}"
+        assert qs.get("api-version") == ["2024-05-01"], f"existing param lost: {captured_urls[0]}"