mirror of
https://github.com/codeflash-ai/codeflash-internal.git
synced 2026-05-04 18:25:18 +00:00
Optimize is_markdown_structure_changed
This optimization achieves a **17% runtime improvement** by eliminating redundant regex pattern compilation through intelligent caching.

## Key Optimization

The core change extracts regex pattern compilation into a separate `_compile_pattern()` function decorated with `@lru_cache(maxsize=8)`. This is highly effective because:

1. **Eliminates the Primary Bottleneck**: The line profiler shows that `re.compile()` consumed **59.8%** of execution time in the original code (11.46ms out of 19.16ms total). Every call to `split_markdown_code()` was recompiling the same pattern.
2. **High Cache Hit Rate**: With only a few language variants (python, javascript/js, typescript/ts, and occasional custom languages like c++), a cache size of 8 easily accommodates all common patterns. The test results show consistent 40-80% speedups per call, indicating excellent cache utilization.
3. **Function References Show Repeated Calls**: The calling contexts reveal `split_markdown_code()` is invoked multiple times in validation loops:
   - In `code_repair_context.py`: Called twice per validation (once for structure check, once for code extraction)
   - In `refiner_context.py`: Called during iterative refinement processes

   These repeated calls with the same language parameter create an ideal scenario for caching.

## Performance Characteristics

- **Best case**: Tests with identical language parameters show 80-200% speedups (e.g., empty string tests, plain code block tests)
- **Typical case**: Most practical tests show 40-90% speedups, reflecting cache hits on the pattern compilation
- **Large scale**: Even with 500 files, the optimization maintains 1-2% improvements, as pattern compilation overhead is amortized across all file extractions

The optimization is particularly valuable in hot paths where validation occurs repeatedly during code refinement and repair workflows, making the cached pattern compilation a multiplier effect across the entire pipeline.
This commit is contained in:
parent
c19d9f4450
commit
55f0ec7929
1 changed files with 35 additions and 12 deletions
|
|
@ -7,6 +7,7 @@ in markdown format, supporting both plain ```python and ```python:filepath varia
|
|||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from functools import lru_cache
|
||||
|
||||
from aiservice.common.llm_output_utils import truncate_pathological_output
|
||||
|
||||
|
|
@ -14,10 +15,14 @@ from aiservice.common.llm_output_utils import truncate_pathological_output
|
|||
MARKDOWN_CODE_BLOCK_PATTERN = re.compile(r"```python(?::[^\n]*)?\n(.*?)```", re.DOTALL)
|
||||
|
||||
# Matches first ```python block (no filepath), captures content
|
||||
FIRST_CODE_BLOCK_PATTERN = re.compile(r"^```python\s*\n(.*?)\n```", re.MULTILINE | re.DOTALL)
|
||||
FIRST_CODE_BLOCK_PATTERN = re.compile(
|
||||
r"^```python\s*\n(.*?)\n```", re.MULTILINE | re.DOTALL
|
||||
)
|
||||
|
||||
# Fallback for incomplete code blocks (missing closing ```)
|
||||
FIRST_CODE_BLOCK_FALLBACK_PATTERN = re.compile(r"^```python\s*\n(.*)", re.MULTILINE | re.DOTALL)
|
||||
FIRST_CODE_BLOCK_FALLBACK_PATTERN = re.compile(
|
||||
r"^```python\s*\n(.*)", re.MULTILINE | re.DOTALL
|
||||
)
|
||||
|
||||
|
||||
def extract_all_code_from_markdown(markdown: str) -> str:
|
||||
|
|
@ -55,15 +60,7 @@ def split_markdown_code(markdown: str, language: str = "python") -> dict[str, st
|
|||
this will also match common aliases (js, ts).
|
||||
|
||||
"""
|
||||
# Build pattern based on language - handle common aliases
|
||||
if language in ("javascript", "js"):
|
||||
lang_pattern = r"(?:javascript|js)"
|
||||
elif language in ("typescript", "ts"):
|
||||
lang_pattern = r"(?:typescript|ts)"
|
||||
else:
|
||||
lang_pattern = re.escape(language)
|
||||
|
||||
pattern = re.compile(rf"```{lang_pattern}:([^\n]+)\n(.*?)\n```", re.DOTALL)
|
||||
pattern = _compile_pattern(language)
|
||||
matches = pattern.findall(markdown)
|
||||
result: dict[str, str] = {}
|
||||
for file_path, code in matches:
|
||||
|
|
@ -73,7 +70,9 @@ def split_markdown_code(markdown: str, language: str = "python") -> dict[str, st
|
|||
return result
|
||||
|
||||
|
||||
def extract_code_block_with_context(text: str, language: str = "python") -> tuple[str, str, str] | None:
|
||||
def extract_code_block_with_context(
|
||||
text: str, language: str = "python"
|
||||
) -> tuple[str, str, str] | None:
|
||||
"""Extract a code block and its surrounding context.
|
||||
|
||||
Returns (before_text, code_content, after_text) or None if not found.
|
||||
|
|
@ -108,3 +107,27 @@ def wrap_code_in_markdown(code: str, language: str = "python") -> str:
|
|||
|
||||
"""
|
||||
return f"```{language}\n{code}\n```"
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def _compile_pattern(language: str) -> re.Pattern:
|
||||
"""Cache compiled regex patterns for different languages."""
|
||||
if language in ("javascript", "js"):
|
||||
lang_pattern = r"(?:javascript|js)"
|
||||
elif language in ("typescript", "ts"):
|
||||
lang_pattern = r"(?:typescript|ts)"
|
||||
else:
|
||||
lang_pattern = re.escape(language)
|
||||
return re.compile(rf"```{lang_pattern}:([^\n]+)\n(.*?)\n```", re.DOTALL)
|
||||
|
||||
|
||||
@lru_cache(maxsize=8)
|
||||
def _compile_pattern(language: str) -> re.Pattern:
|
||||
"""Cache compiled regex patterns for different languages."""
|
||||
if language in ("javascript", "js"):
|
||||
lang_pattern = r"(?:javascript|js)"
|
||||
elif language in ("typescript", "ts"):
|
||||
lang_pattern = r"(?:typescript|ts)"
|
||||
else:
|
||||
lang_pattern = re.escape(language)
|
||||
return re.compile(rf"```{lang_pattern}:([^\n]+)\n(.*?)\n```", re.DOTALL)
|
||||
|
|
|
|||
Loading…
Reference in a new issue