Fix: Handle LLM client close() errors gracefully

**Issue**: The `/ai/optimization_review` endpoint was returning 500 errors
when trying to close LLM clients during event loop changes.

**Root Cause**: In `aiservice/llm.py` lines 96-99, the `close()` calls on
OpenAI and Anthropic clients were not wrapped in exception handlers. When
the httpx transport was already closed or in a bad state (e.g., event loop
closure, connection already closed), the exception would propagate and cause
the entire request to fail with a 500 error.
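
For context, the failure can be reproduced without any HTTP traffic at all. The sketch below is illustrative only (the `_Client` stub and `cached` dict are hypothetical stand-ins, not the real `aiservice` code): each `asyncio.run()` creates and then closes its own event loop, so a client cached across calls ends up bound to a dead loop, and a bare `await close()` on it propagates a `RuntimeError`.

```python
import asyncio


class _Client:
    """Stand-in for an httpx-backed SDK client bound to its creation loop."""

    def __init__(self):
        self.loop = asyncio.get_running_loop()

    async def close(self):
        # Real transports fail similarly when their loop is already gone.
        if self.loop.is_closed():
            raise RuntimeError("Event loop is closed")


cached: dict = {}


async def handle_request():
    loop = asyncio.get_running_loop()
    if cached.get("loop") is not loop:
        old = cached.get("client")
        if old is not None:
            await old.close()  # pre-fix behavior: this exception becomes a 500
        cached["client"] = _Client()
        cached["loop"] = loop


asyncio.run(handle_request())      # first loop: client is created and cached
try:
    asyncio.run(handle_request())  # second loop: closing the stale client raises
except RuntimeError as e:
    print(f"unhandled close() error: {e}")
```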

**Fix**: Wrapped both `openai_client.close()` and `anthropic_client.close()`
in try-except blocks that catch and log exceptions at DEBUG level. This
prevents transport errors from crashing requests while still attempting to
clean up resources properly.

**Impact**: Fixes 500 errors on `/ai/optimization_review` and other endpoints
that use the LLM client when event loops change or clients are in bad states.

**Testing**: Added `test_llm_client_close.py` with two test cases that verify:
1. Transport errors during close() are handled gracefully
2. Event loop closed errors are handled gracefully
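
Both cases can be run locally with `pytest test_llm_client_close.py -v`; they are marked `@pytest.mark.asyncio`, so `pytest-asyncio` (or an equivalent async plugin) must be installed, and the exact path may differ with the repository's test layout.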

**Traces**: 312d7392, 5bbdf214, a1325051
Author: mohammed ahmed
Date: 2026-04-03 19:19:19 +00:00
Commit: 4c4b497d2a (parent: b814e1e7e6)
2 changed files with 141 additions and 2 deletions

`aiservice/llm.py`:

```diff
@@ -93,10 +93,23 @@ class LLMClient:
         loop = asyncio.get_running_loop()
         if loop is not self.client_loop:
             # Close old clients to prevent connection leaks and event loop closure errors
+            # Ignore errors if the client is already closed or the transport is in a bad state
             if self.openai_client is not None:
-                await self.openai_client.close()
+                try:
+                    await self.openai_client.close()
+                except Exception as e:
+                    logger.debug(
+                        "Failed to close OpenAI client (already closed or transport error): %s",
+                        type(e).__name__,
+                    )
             if self.anthropic_client is not None:
-                await self.anthropic_client.close()
+                try:
+                    await self.anthropic_client.close()
+                except Exception as e:
+                    logger.debug(
+                        "Failed to close Anthropic client (already closed or transport error): %s",
+                        type(e).__name__,
+                    )
             self.client_loop = loop
             self.background_tasks = set()
```
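
Since both branches now carry an identical guard, the logic could be factored into a small helper if more providers are added later. A sketch of that hypothetical refactor (`_close_quietly` is an invented name and is not part of this commit; `logger` is the module logger already used above):

```python
# Hypothetical helper (not in this commit): close a client while swallowing
# and debug-logging errors from an already-closed transport or dead loop.
async def _close_quietly(client, name: str) -> None:
    if client is None:
        return
    try:
        await client.close()
    except Exception as e:
        logger.debug(
            "Failed to close %s client (already closed or transport error): %s",
            name,
            type(e).__name__,
        )
```

The loop-change branch would then reduce to `await _close_quietly(self.openai_client, "OpenAI")` and the Anthropic equivalent.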

`test_llm_client_close.py` (new file, 126 lines added):
"""
Test for LLM client close() error handling.
This test verifies that the LLMClient handles exceptions gracefully when
closing clients during event loop changes, preventing 500 errors.
"""
import asyncio
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
class TestLLMClientClose:
"""Test LLMClient handles close() failures gracefully"""
@pytest.mark.asyncio
async def test_close_handles_transport_errors(self):
"""
Test that client close() errors are caught and don't propagate.
This reproduces the bug where httpx transport errors during close()
cause 500 errors on /ai/optimization_review.
"""
from aiservice.llm import LLMClient
from aiservice.llm_models import OpenAI_GPT_4_1
# Create a mock client that fails on close()
mock_openai = AsyncMock()
mock_openai.close = AsyncMock(
side_effect=RuntimeError("Transport already closed")
)
# Patch the client creation
with (
patch("aiservice.llm.AsyncAzureOpenAI", return_value=mock_openai),
patch("aiservice.llm.has_openai", True),
patch("aiservice.llm.has_anthropic", False),
):
client = LLMClient()
# First call - creates clients
with patch.object(client, "call_openai", AsyncMock()):
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-1",
)
except Exception:
pass # Ignore call failures, we only care about close()
# Force event loop change detection
client.client_loop = None
# Second call - should try to close old client and handle errors gracefully
with patch.object(client, "call_openai", AsyncMock()):
# This should NOT raise RuntimeError from close()
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-2",
)
# If we get here without exception, the bug is fixed
assert True
except RuntimeError as e:
if "Transport already closed" in str(e):
pytest.fail(
f"close() error was not handled: {e}. "
"This causes 500 errors in production."
)
raise
@pytest.mark.asyncio
async def test_close_handles_event_loop_closed_error(self):
"""
Test that event loop closed errors during close() are handled.
This handles the case where the event loop is closed before we
try to close the client.
"""
from aiservice.llm import LLMClient
from aiservice.llm_models import OpenAI_GPT_4_1
mock_openai = AsyncMock()
mock_openai.close = AsyncMock(side_effect=RuntimeError("Event loop is closed"))
with (
patch("aiservice.llm.AsyncAzureOpenAI", return_value=mock_openai),
patch("aiservice.llm.has_openai", True),
patch("aiservice.llm.has_anthropic", False),
):
client = LLMClient()
# Create client
with patch.object(client, "call_openai", AsyncMock()):
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-1",
)
except Exception:
pass
# Force event loop change
client.client_loop = None
# Second call should handle close error
with patch.object(client, "call_openai", AsyncMock()):
try:
await client.call(
llm=OpenAI_GPT_4_1(),
messages=[{"role": "user", "content": "test"}],
call_type="test",
trace_id="test-trace-2",
)
assert True
except RuntimeError as e:
if "Event loop is closed" in str(e):
pytest.fail(f"Event loop error was not handled: {e}")
raise