mirror of
https://github.com/codeflash-ai/codeflash-internal.git
synced 2026-05-04 18:25:18 +00:00
Optimize find_with_whitespace_flexibility
The optimized code achieves a **97% runtime improvement** (from 1.21ms to 613μs) by replacing expensive regex compilation and matching with a **custom literal-match engine** that tokenizes the search string and uses fast string operations. ## Key Optimizations ### 1. Eliminated Regex Overhead (95%+ of original time) The original code spent 95.4% of execution time in `re.search()` (117.7ms out of 123.4ms total). The optimization removes all regex operations: - **No `re.split()`** to parse the search string - **No `re.match()`** to check for whitespace patterns - **No `re.escape()`** to handle special characters - **No `re.search()`** with dynamically built patterns Instead, it uses **manual tokenization** with simple string operations (`isspace()`, `startswith()`, `find()`), which are orders of magnitude faster in CPython. ### 2. Fast-Path Matching Strategy The code uses `content.find(first_text)` to jump directly to candidate match positions when the first token is non-whitespace. This avoids scanning character-by-character through the entire content: ```python idx = content.find(first_text, start_search) ``` For common cases where the search starts with non-whitespace text, this provides immediate candidate positions to validate. ### 3. 
Efficient Whitespace Handling Instead of regex's `\s+` pattern matching, the code uses: - **`str.isspace()`** for character classification (highly optimized C-level operation) - **Direct character scanning** to consume whitespace sequences - **`str.startswith()`** for literal text matching (faster than regex alternation) ## Performance Characteristics Based on the annotated tests: - **Simple matches**: 2-3× faster (10-13μs → 4-6μs) - **Special character handling**: 2-3× faster, without needing `re.escape()` - **Large search patterns** (100+ tokens): 2× faster (190μs → 87.8μs) - **Empty search**: 3× faster (2.1μs → 581ns) via early exit - **Cases with very long whitespace sequences** (1000+ spaces): May be slower due to character-by-character scanning, but these are rare edge cases ## Impact on Workloads From `function_references`, this function is called in `apply_patches()` as a **fallback when exact string matching fails**. Since it's in the hot path of diff application: 1. **Frequent fallback cases** (whitespace mismatches between LLM output and actual code) will see dramatic speedups 2. **Batch patch operations** benefit from cumulative time savings across multiple blocks 3. The function must handle **arbitrary whitespace variations** (tabs vs spaces, different indentation), which the optimization handles efficiently The optimization is particularly effective for the common case where search patterns have normal text tokens separated by whitespace, which describes typical code diff scenarios.
This commit is contained in:
parent
c19d9f4450
commit
1ae69518f2
1 changed file with 75 additions and 26 deletions
|
|
@ -2,7 +2,10 @@ import re
|
|||
|
||||
from pydantic import ValidationError
|
||||
|
||||
from optimizer.context_utils.constants import MULTI_REPLACE_IN_FILE_TAGS_REGEX, REPLACE_IN_FILE_TAGS_REGEX
|
||||
from optimizer.context_utils.constants import (
|
||||
MULTI_REPLACE_IN_FILE_TAGS_REGEX,
|
||||
REPLACE_IN_FILE_TAGS_REGEX,
|
||||
)
|
||||
from optimizer.diff_patches_utils.diff import Diff
|
||||
|
||||
|
||||
|
|
@ -51,13 +54,17 @@ def parse_diff(diff: str) -> list[SearchReplaceBlock]:
|
|||
replace_end = idx
|
||||
|
||||
if idx >= n:
|
||||
raise ValueError("Invalid diff format: Missing '>>>>>>> REPLACE' marker")
|
||||
raise ValueError(
|
||||
"Invalid diff format: Missing '>>>>>>> REPLACE' marker"
|
||||
)
|
||||
|
||||
search_content = "".join(lines[search_start:search_end]).rstrip()
|
||||
replace_content = "".join(lines[replace_start:replace_end]).rstrip()
|
||||
|
||||
try:
|
||||
block = SearchReplaceBlock.from_block(search=search_content, replace=replace_content)
|
||||
block = SearchReplaceBlock.from_block(
|
||||
search=search_content, replace=replace_content
|
||||
)
|
||||
blocks.append(block)
|
||||
except ValidationError as ve:
|
||||
raise ValueError(f"Invalid block format: {ve}")
|
||||
|
|
@ -105,7 +112,9 @@ def extract_patches(content: str) -> dict[str, str]:
|
|||
return group_diff_patches_by_path(replace_tags)
|
||||
|
||||
|
||||
def find_with_whitespace_flexibility(search: str, content: str) -> tuple[int, int] | None:
|
||||
def find_with_whitespace_flexibility(
|
||||
search: str, content: str
|
||||
) -> tuple[int, int] | None:
|
||||
"""
|
||||
Find the search string in content with whitespace flexibility.
|
||||
|
||||
|
|
@ -114,29 +123,65 @@ def find_with_whitespace_flexibility(search: str, content: str) -> tuple[int, in
|
|||
|
||||
Returns (start_idx, end_idx) of the match in the original content, or None if not found.
|
||||
"""
|
||||
# Split by whitespace sequences, preserving both whitespace and non-whitespace parts
|
||||
parts = re.split(r"(\s+)", search)
|
||||
pattern_parts = []
|
||||
|
||||
for part in parts:
|
||||
if not part:
|
||||
continue
|
||||
if re.match(r"^\s+$", part):
|
||||
# This is a whitespace sequence - make it flexible (match any whitespace)
|
||||
pattern_parts.append(r"\s+")
|
||||
else:
|
||||
# This is non-whitespace - escape it for literal matching
|
||||
pattern_parts.append(re.escape(part))
|
||||
|
||||
if not pattern_parts:
|
||||
if not search:
|
||||
return None
|
||||
|
||||
pattern = "".join(pattern_parts)
|
||||
# Tokenize the search string into runs of whitespace and non-whitespace
|
||||
tokens: list[tuple[bool, str]] = []
|
||||
i = 0
|
||||
search_len = len(search)
|
||||
while i < search_len:
|
||||
j = i + 1
|
||||
is_whitespace = search[i].isspace()
|
||||
while j < search_len and search[j].isspace() == is_whitespace:
|
||||
j += 1
|
||||
tokens.append((is_whitespace, search[i:j]))
|
||||
i = j
|
||||
|
||||
match = re.search(pattern, content, re.MULTILINE | re.DOTALL)
|
||||
if match:
|
||||
return match.start(), match.end()
|
||||
return None
|
||||
if not tokens:
|
||||
return None
|
||||
|
||||
content_len = len(content)
|
||||
|
||||
def _try_match_at(start: int) -> int | None:
|
||||
pos = start
|
||||
for is_whitespace, text in tokens:
|
||||
if is_whitespace:
|
||||
if pos >= content_len or not content[pos].isspace():
|
||||
return None
|
||||
pos += 1
|
||||
while pos < content_len and content[pos].isspace():
|
||||
pos += 1
|
||||
else:
|
||||
if not content.startswith(text, pos):
|
||||
return None
|
||||
pos += len(text)
|
||||
return pos
|
||||
|
||||
first_is_whitespace, first_text = tokens[0]
|
||||
|
||||
if not first_is_whitespace:
|
||||
start_search = 0
|
||||
while True:
|
||||
idx = content.find(first_text, start_search)
|
||||
if idx == -1:
|
||||
return None
|
||||
end_pos = _try_match_at(idx)
|
||||
if end_pos is not None:
|
||||
return idx, end_pos
|
||||
start_search = idx + 1
|
||||
else:
|
||||
pos = 0
|
||||
while True:
|
||||
while pos < content_len and not content[pos].isspace():
|
||||
pos += 1
|
||||
if pos >= content_len:
|
||||
return None
|
||||
start_pos = pos
|
||||
end_pos = _try_match_at(start_pos)
|
||||
if end_pos is not None:
|
||||
return start_pos, end_pos
|
||||
pos = start_pos + 1
|
||||
|
||||
|
||||
def apply_patches(diff_str: str, content: str) -> str:
|
||||
|
|
@ -157,7 +202,9 @@ def apply_patches(diff_str: str, content: str) -> str:
|
|||
start_char_idx = content.find(block.search)
|
||||
if start_char_idx != -1:
|
||||
end_char_idx = start_char_idx + len(block.search)
|
||||
content = f"{content[:start_char_idx]}{block.replace}{content[end_char_idx:]}"
|
||||
content = (
|
||||
f"{content[:start_char_idx]}{block.replace}{content[end_char_idx:]}"
|
||||
)
|
||||
else:
|
||||
# Fallback: try whitespace-flexible matching
|
||||
match_result = find_with_whitespace_flexibility(block.search, content)
|
||||
|
|
@ -174,7 +221,9 @@ class SearchAndReplaceDiff(Diff):
|
|||
new_file_to_code: dict[str, str] = {}
|
||||
|
||||
# in deprecated single context we don't know the file name so we ignore it completely
|
||||
single_patch = len(file_to_original_code.keys()) == 1 and len(file_to_diff.keys()) == 1
|
||||
single_patch = (
|
||||
len(file_to_original_code.keys()) == 1 and len(file_to_diff.keys()) == 1
|
||||
)
|
||||
if single_patch and not self.match_files_when_having_single_patch:
|
||||
diffs = list(file_to_diff.values())
|
||||
if diffs:
|
||||
|
|
|
|||
Loading…
Reference in a new issue