Fix: Handle LLM client close() errors gracefully
**Issue**: The `/ai/optimization_review` endpoint was returning 500 errors when trying to close LLM clients during event loop changes. **Root Cause**: In `aiservice/llm.py` lines 96-99, the `close()` calls on OpenAI and Anthropic clients were not wrapped in exception handlers. When the httpx transport was already closed or in a bad state (e.g., event loop closure, connection already closed), the exception would propagate and cause the entire request to fail with a 500 error. **Fix**: Wrapped both `openai_client.close()` and `anthropic_client.close()` in try-except blocks that catch and log exceptions at DEBUG level. This prevents transport errors from crashing requests while still attempting to clean up resources properly. **Impact**: Fixes 500 errors on `/ai/optimization_review` and other endpoints that use the LLM client when event loops change or clients are in bad states. **Testing**: Added `test_llm_client_close.py` with 2 test cases that verify: 1. Transport errors during close() are handled gracefully 2. Event loop closed errors are handled gracefully **Traces**: 312d7392, 5bbdf214, a1325051
This commit is contained in:
parent
b814e1e7e6
commit
4c4b497d2a
2 changed files with 141 additions and 2 deletions
|
|
@@ -93,10 +93,23 @@ class LLMClient:
|
|||
loop = asyncio.get_running_loop()
|
||||
if loop is not self.client_loop:
|
||||
# Close old clients to prevent connection leaks and event loop closure errors
|
||||
# Ignore errors if the client is already closed or the transport is in a bad state
|
||||
if self.openai_client is not None:
|
||||
await self.openai_client.close()
|
||||
try:
|
||||
await self.openai_client.close()
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
"Failed to close OpenAI client (already closed or transport error): %s",
|
||||
type(e).__name__,
|
||||
)
|
||||
if self.anthropic_client is not None:
|
||||
await self.anthropic_client.close()
|
||||
try:
|
||||
await self.anthropic_client.close()
|
||||
except Exception as e:
|
||||
logger.debug(
|
||||
"Failed to close Anthropic client (already closed or transport error): %s",
|
||||
type(e).__name__,
|
||||
)
|
||||
|
||||
self.client_loop = loop
|
||||
self.background_tasks = set()
|
||||
|
|
|
|||
126
django/aiservice/aiservice/tests/test_llm_client_close.py
Normal file
126
django/aiservice/aiservice/tests/test_llm_client_close.py
Normal file
|
|
@@ -0,0 +1,126 @@
|
|||
"""
|
||||
Test for LLM client close() error handling.
|
||||
|
||||
This test verifies that the LLMClient handles exceptions gracefully when
|
||||
closing clients during event loop changes, preventing 500 errors.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
class TestLLMClientClose:
    """Test that LLMClient handles close() failures gracefully.

    Both tests drive the same scenario — an underlying OpenAI client whose
    ``close()`` raises during an event-loop change — so the shared machinery
    lives in ``_assert_close_error_handled`` and each test supplies only the
    error message it cares about.
    """

    async def _assert_close_error_handled(self, error_message: str) -> None:
        """Drive two LLMClient.call() invocations across a forced event-loop
        change and fail the test if the given close() error propagates.

        Args:
            error_message: Message of the RuntimeError raised by the mocked
                client's close(); if a RuntimeError containing this message
                escapes the second call(), the bug is present.
        """
        from aiservice.llm import LLMClient
        from aiservice.llm_models import OpenAI_GPT_4_1

        # Mock OpenAI client whose close() always raises — simulates an httpx
        # transport already closed or an event loop torn down underneath it.
        mock_openai = AsyncMock()
        mock_openai.close = AsyncMock(side_effect=RuntimeError(error_message))

        with (
            patch("aiservice.llm.AsyncAzureOpenAI", return_value=mock_openai),
            patch("aiservice.llm.has_openai", True),
            patch("aiservice.llm.has_anthropic", False),
        ):
            client = LLMClient()

            # First call — creates the underlying clients.
            with patch.object(client, "call_openai", AsyncMock()):
                try:
                    await client.call(
                        llm=OpenAI_GPT_4_1(),
                        messages=[{"role": "user", "content": "test"}],
                        call_type="test",
                        trace_id="test-trace-1",
                    )
                except Exception:
                    pass  # Ignore call failures; only close() behavior matters.

            # Force the loop-change branch so the next call() tries to close
            # the (failing) old client.
            client.client_loop = None

            # Second call — must swallow the close() error instead of letting
            # it propagate (which surfaces as a 500 in production).
            with patch.object(client, "call_openai", AsyncMock()):
                try:
                    await client.call(
                        llm=OpenAI_GPT_4_1(),
                        messages=[{"role": "user", "content": "test"}],
                        call_type="test",
                        trace_id="test-trace-2",
                    )
                except RuntimeError as e:
                    if error_message in str(e):
                        pytest.fail(
                            f"close() error was not handled: {e}. "
                            "This causes 500 errors in production."
                        )
                    # Unrelated RuntimeError — let it surface normally.
                    raise

    @pytest.mark.asyncio
    async def test_close_handles_transport_errors(self):
        """
        Test that client close() errors are caught and don't propagate.

        This reproduces the bug where httpx transport errors during close()
        cause 500 errors on /ai/optimization_review.
        """
        await self._assert_close_error_handled("Transport already closed")

    @pytest.mark.asyncio
    async def test_close_handles_event_loop_closed_error(self):
        """
        Test that event loop closed errors during close() are handled.

        This handles the case where the event loop is closed before we
        try to close the client.
        """
        await self._assert_close_error_handled("Event loop is closed")
|
||||
Loading…
Reference in a new issue