Fix: Handle LLM client close() errors gracefully

**Issue**: The `/ai/optimization_review` endpoint was returning 500 errors
when trying to close LLM clients during event loop changes.

**Root Cause**: In `aiservice/llm.py` lines 96-99, the `close()` calls on
OpenAI and Anthropic clients were not wrapped in exception handlers. When
the httpx transport was already closed or in a bad state (e.g., event loop
closure, connection already closed), the exception would propagate and cause
the entire request to fail with a 500 error.
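
For context, the failure can be reproduced without any HTTP traffic at all. The sketch below is illustrative only (the `_Client` stub and `cached` dict are hypothetical stand-ins, not the real `aiservice` code): each `asyncio.run()` creates and then closes its own event loop, so a client cached across calls ends up bound to a dead loop, and a bare `await close()` on it propagates a `RuntimeError`.

```python
import asyncio


class _Client:
    """Stand-in for an httpx-backed SDK client bound to its creation loop."""

    def __init__(self):
        self.loop = asyncio.get_running_loop()

    async def close(self):
        # Real transports fail similarly when their loop is already gone.
        if self.loop.is_closed():
            raise RuntimeError("Event loop is closed")


cached: dict = {}


async def handle_request():
    loop = asyncio.get_running_loop()
    if cached.get("loop") is not loop:
        old = cached.get("client")
        if old is not None:
            await old.close()  # pre-fix behavior: this exception becomes a 500
        cached["client"] = _Client()
        cached["loop"] = loop


asyncio.run(handle_request())      # first loop: client is created and cached
try:
    asyncio.run(handle_request())  # second loop: closing the stale client raises
except RuntimeError as e:
    print(f"unhandled close() error: {e}")
```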

**Fix**: Wrapped both `openai_client.close()` and `anthropic_client.close()`
in try-except blocks that catch and log exceptions at DEBUG level. This
prevents transport errors from crashing requests while still attempting to
clean up resources properly.

**Impact**: Fixes 500 errors on `/ai/optimization_review` and other endpoints
that use the LLM client when event loops change or clients are in bad states.

**Testing**: Added `test_llm_client_close.py` with two test cases that verify:
1. Transport errors during close() are handled gracefully
2. Event loop closed errors are handled gracefully
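
Both cases can be run locally with `pytest test_llm_client_close.py -v`; they are marked `@pytest.mark.asyncio`, so `pytest-asyncio` (or an equivalent async plugin) must be installed, and the exact path may differ with the repository's test layout.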

**Traces**: 312d7392, 5bbdf214, a1325051
Author: mohammed ahmed
Date: 2026-04-03 19:19:19 +00:00
Commit: 4c4b497d2a (parent: b814e1e7e6)
2 changed files with 141 additions and 2 deletions

`aiservice/llm.py`:

```diff
@@ -93,10 +93,23 @@ class LLMClient:
         loop = asyncio.get_running_loop()
         if loop is not self.client_loop:
             # Close old clients to prevent connection leaks and event loop closure errors
+            # Ignore errors if the client is already closed or the transport is in a bad state
             if self.openai_client is not None:
-                await self.openai_client.close()
+                try:
+                    await self.openai_client.close()
+                except Exception as e:
+                    logger.debug(
+                        "Failed to close OpenAI client (already closed or transport error): %s",
+                        type(e).__name__,
+                    )
             if self.anthropic_client is not None:
-                await self.anthropic_client.close()
+                try:
+                    await self.anthropic_client.close()
+                except Exception as e:
+                    logger.debug(
+                        "Failed to close Anthropic client (already closed or transport error): %s",
+                        type(e).__name__,
+                    )
             self.client_loop = loop
             self.background_tasks = set()
```
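
Since both branches now carry an identical guard, the logic could be factored into a small helper if more providers are added later. A sketch of that hypothetical refactor (`_close_quietly` is an invented name and is not part of this commit; `logger` is the module logger already used above):

```python
# Hypothetical helper (not in this commit): close a client while swallowing
# and debug-logging errors from an already-closed transport or dead loop.
async def _close_quietly(client, name: str) -> None:
    if client is None:
        return
    try:
        await client.close()
    except Exception as e:
        logger.debug(
            "Failed to close %s client (already closed or transport error): %s",
            name,
            type(e).__name__,
        )
```

The loop-change branch would then reduce to `await _close_quietly(self.openai_client, "OpenAI")` and the Anthropic equivalent.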

`test_llm_client_close.py` (new file, 126 lines added):
"""
Test for LLM client close() error handling.
This test verifies that the LLMClient handles exceptions gracefully when
closing clients during event loop changes, preventing 500 errors.
"""
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
class TestLLMClientClose:
"""Test LLMClient handles close() failures gracefully"""
@pytest.mark.asyncio
async def test_close_handles_transport_errors(self):
"""
Test that client close() errors are caught and don't propagate.
This reproduces the bug where httpx transport errors during close()
cause 500 errors on /ai/optimization_review.
"""
from aiservice.llm import LLMClient
from aiservice.llm_models import OpenAI_GPT_4_1
# Create a mock client that fails on close()
mock_openai = AsyncMock()
mock_openai.close = AsyncMock(
side_effect=RuntimeError("Transport already closed")
)
# Patch the client creation
with (
patch("aiservice.llm.AsyncAzureOpenAI", return_value=mock_openai),
patch("aiservice.llm.has_openai", True),
patch("aiservice.llm.has_anthropic", False),
):
client = LLMClient()
# First call - creates clients
with patch.object(client, "call_openai", AsyncMock()):
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-1",
)
except Exception:
pass # Ignore call failures, we only care about close()
# Force event loop change detection
client.client_loop = None
# Second call - should try to close old client and handle errors gracefully
with patch.object(client, "call_openai", AsyncMock()):
# This should NOT raise RuntimeError from close()
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-2",
)
# If we get here without exception, the bug is fixed
assert True
except RuntimeError as e:
if "Transport already closed" in str(e):
pytest.fail(
f"close() error was not handled: {e}. "
"This causes 500 errors in production."
)
raise
@pytest.mark.asyncio
async def test_close_handles_event_loop_closed_error(self):
"""
Test that event loop closed errors during close() are handled.
This handles the case where the event loop is closed before we
try to close the client.
"""
from aiservice.llm import LLMClient
from aiservice.llm_models import OpenAI_GPT_4_1
mock_openai = AsyncMock()
mock_openai.close = AsyncMock(side_effect=RuntimeError("Event loop is closed"))
with (
patch("aiservice.llm.AsyncAzureOpenAI", return_value=mock_openai),
patch("aiservice.llm.has_openai", True),
patch("aiservice.llm.has_anthropic", False),
):
client = LLMClient()
# Create client
with patch.object(client, "call_openai", AsyncMock()):
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-1",
)
except Exception:
pass
# Force event loop change
client.client_loop = None
# Second call should handle close error
with patch.object(client, "call_openai", AsyncMock()):
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-2",
)
assert True
except RuntimeError as e:
if "Event loop is closed" in str(e):
pytest.fail(f"Event loop error was not handled: {e}")
raise