fix: handle nested code fences in extract_code_block

The non-greedy regex in FIRST_CODE_BLOCK_PATTERN stopped at the first
``` that began a line, even when it sat inside a triple-quoted string or
a nested code fence block. This truncated the extracted code and lost test
functions when LLMs embedded function definitions using ```python:filepath
syntax.

Switch to greedy matching and require the closing ``` to be alone on
its line so intermediate backticks are skipped.
This commit is contained in:
Kevin Turcios 2026-02-22 23:39:01 -05:00
parent ca71d0c8a0
commit c95a36cf38
2 changed files with 18 additions and 2 deletions

View file

@ -13,8 +13,10 @@ from aiservice.common.llm_output_utils import truncate_pathological_output
# Matches both ```python and ```python:filepath blocks, captures content only.
# Non-greedy: each match ends at the first ``` that follows the opening line,
# wherever on a line that ``` appears.
MARKDOWN_CODE_BLOCK_PATTERN = re.compile(r"```python(?::[^\n]*)?\n(.*?)```", re.DOTALL)
# Matches first ```python block (no filepath), captures content
# Non-greedy variant: ends at the first ``` that begins a line.
# NOTE(review): the same name is assigned again just below, so this binding is
# superseded -- it looks like the pre-change line of the diff; confirm.
FIRST_CODE_BLOCK_PATTERN = re.compile(r"^```python\s*\n(.*?)\n```", re.MULTILINE | re.DOTALL)
# Matches first ```python block (no filepath), captures content.
# Uses greedy (.*) to handle LLM outputs with nested code fences (e.g. ```python:filepath
# blocks inside the main block). Requires closing ``` to be alone on its line.
# Because the group is greedy and DOTALL is set, the match runs to the LAST line
# consisting only of ``` (optionally followed by spaces/tabs), not the first one.
# NOTE(review): if the input contains several separate fenced blocks, the capture
# would span from the first opener to the final closing fence, prose in between
# included -- confirm callers only pass single-block output.
FIRST_CODE_BLOCK_PATTERN = re.compile(r"^```python\s*\n(.*)\n```[ \t]*$", re.MULTILINE | re.DOTALL)
# Fallback for incomplete code blocks (missing closing ```)
# Captures everything from the line after a line-initial ```python opener through
# the end of the text.
FIRST_CODE_BLOCK_FALLBACK_PATTERN = re.compile(r"^```python\s*\n(.*)", re.MULTILINE | re.DOTALL)

View file

@ -181,6 +181,20 @@ x ="""
assert result == expected
def test_extract_code_block_nested_code_fence_in_triple_quote() -> None:
    """A ``` fence embedded in a triple-quoted string must not end the block."""
    # Everything between the outer fences is expected back, inner fences included.
    expected = 'import pytest\n_source = """```python:file.py\ndef foo(): pass\n```"""\ndef test_foo():\n assert True'
    # Simulated LLM output: a function definition embedded in a triple-quoted
    # string that itself contains ``` markers.
    llm_output = '```python\nimport pytest\n_source = """```python:file.py\ndef foo(): pass\n```"""\ndef test_foo():\n assert True\n```'
    extracted = extract_code_block(llm_output)
    assert extracted == expected
def test_extract_code_block_nested_code_fence_block() -> None:
    """A ```python:filepath block nested in the main block is kept verbatim."""
    # The nested fence pair stays in the extracted content; only the outermost
    # opening and closing fences are removed.
    expected = 'import pytest\n```python:src/mod.py\ndef foo(): pass\n```\ndef test_foo():\n assert True'
    # Simulated LLM output with a ```python:filepath block inside the main block.
    llm_output = '```python\nimport pytest\n```python:src/mod.py\ndef foo(): pass\n```\ndef test_foo():\n assert True\n```'
    extracted = extract_code_block(llm_output)
    assert extracted == expected
def test_extract_all_code_single_block() -> None:
text = "```python\ncode1\n```"
result = extract_all_code_from_markdown(text)