fix loop count issue across subsequent tests

Sarthak Agarwal 2026-02-09 20:21:37 +05:30
parent c233a371a9
commit 410aca7b83
6 changed files with 136 additions and 4 deletions

View file

@@ -175,6 +175,17 @@ def parse_jest_test_xml(
logger.debug(f"Found {marker_count} timing start markers in Jest stdout")
else:
logger.debug(f"No timing start markers found in Jest stdout (len={len(global_stdout)})")
# Check for END markers with duration (perf test markers)
end_marker_count = len(jest_end_pattern.findall(global_stdout))
if end_marker_count > 0:
logger.debug(f"[PERF-DEBUG] Found {end_marker_count} END timing markers with duration in Jest stdout")
# Sample a few markers to verify loop indices
end_samples = list(jest_end_pattern.finditer(global_stdout))[:5]
for sample in end_samples:
groups = sample.groups()
logger.debug(f"[PERF-DEBUG] Sample END marker: loopIndex={groups[3]}, duration={groups[5]}")
else:
logger.debug("[PERF-DEBUG] No END markers with duration found in Jest stdout")
except (AttributeError, UnicodeDecodeError):
global_stdout = ""
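For reference, the END markers counted in this hunk carry six colon-separated fields. A minimal parsing sketch, assuming the layout implied by the groups comment later in this commit (module, testName, funcName, loopIndex, invocationId, durationNs); the sample marker string is hypothetical:

import re

# Assumed layout: !######module:testName:funcName:loopIndex:invocationId:durationNs######!
jest_end_pattern = re.compile(r"!######([^:]+):([^:]+):([^:]+):(\d+):([^:]+):(\d+)######!")

stdout = "!######tests/math.test.ts:adds numbers:add:3:0_1:41250######!"  # hypothetical sample
for m in jest_end_pattern.finditer(stdout):
    groups = m.groups()
    # groups[3] is the loop index, groups[5] the duration in nanoseconds
    print(f"loopIndex={groups[3]}, duration={groups[5]}")  # loopIndex=3, duration=41250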
@@ -197,6 +208,12 @@ def parse_jest_test_xml(
key = match.groups()[:5]
end_matches_dict[key] = match
# Debug: log suite-level END marker parsing for perf tests
if end_matches_dict:
# Get unique loop indices from the parsed END markers
loop_indices = sorted(set(int(k[3]) if k[3].isdigit() else 1 for k in end_matches_dict.keys()))
logger.debug(f"[PERF-DEBUG] Suite {suite_count}: parsed {len(end_matches_dict)} END markers from suite_stdout, loop_index range: {min(loop_indices)}-{max(loop_indices)}")
# Also collect timing markers from testcase-level system-out (Vitest puts output at testcase level)
for tc in suite:
tc_system_out = tc._elem.find("system-out") # noqa: SLF001
@@ -327,6 +344,13 @@ def parse_jest_test_xml(
sanitized_test_name = re.sub(r"[!#: ()\[\]{}|\\/*?^$.+\-]", "_", test_name)
matching_starts = [m for m in start_matches if sanitized_test_name in m.group(2)]
# Debug: log which branch we're taking
logger.debug(
f"[FLOW-DEBUG] Testcase '{test_name[:50]}': "
f"total_start_matches={len(start_matches)}, matching_starts={len(matching_starts)}, "
f"total_end_matches={len(end_matches_dict)}"
)
# For performance tests (capturePerf), there are no START markers - only END markers with duration
# Check for END markers directly if no START markers found
matching_ends_direct = []
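A worked example of the sanitization above, using a hypothetical test name: every character in the class is replaced with an underscore before the substring match against the marker's test-name field.

import re

name = "adds (two) numbers!"  # hypothetical test name
sanitized = re.sub(r"[!#: ()\[\]{}|\\/*?^$.+\-]", "_", name)
print(sanitized)  # adds__two__numbers_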
@@ -337,6 +361,28 @@ def parse_jest_test_xml(
# end_key is (module, testName, funcName, loopIndex, invocationId)
if len(end_key) >= 2 and sanitized_test_name in end_key[1]:
matching_ends_direct.append(end_match)
# Debug: log matching results for perf tests
if matching_ends_direct:
loop_indices = [int(m.groups()[3]) if m.groups()[3].isdigit() else 1 for m in matching_ends_direct]
logger.debug(
f"[PERF-MATCH] Testcase '{test_name[:40]}': matched {len(matching_ends_direct)} END markers, "
f"loop_index range: {min(loop_indices)}-{max(loop_indices)}"
)
elif end_matches_dict:
# No matches but we have END markers - check why
sample_keys = list(end_matches_dict.keys())[:3]
logger.debug(
f"[PERF-MISMATCH] Testcase '{test_name[:40]}': no matches found. "
f"sanitized_test_name='{sanitized_test_name[:50]}', "
f"sample end_keys={[k[1][:30] if len(k) >= 2 else k for k in sample_keys]}"
)
# Log if we're skipping the matching_ends_direct branch
if matching_starts and end_matches_dict:
logger.debug(
f"[FLOW-SKIP] Testcase '{test_name[:40]}': has {len(matching_starts)} START markers, "
f"skipping {len(end_matches_dict)} END markers (behavior test mode)"
)
if not matching_starts and not matching_ends_direct:
# No timing markers found - use JUnit XML time attribute as fallback
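Condensed, the branches added in these hunks form a three-way dispatch. A hedged sketch of the branch order, with names taken from the diff and bodies reduced to labels:

def choose_mode(matching_starts, matching_ends_direct):
    if not matching_starts and not matching_ends_direct:
        return "junit_time_fallback"  # no markers: use the XML time attribute
    if matching_ends_direct:
        return "performance"  # END markers already carry durationNs
    return "behavior"  # process each START timing marker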
@@ -373,11 +419,13 @@ def parse_jest_test_xml(
)
elif matching_ends_direct:
# Performance test format: process END markers directly (no START markers)
loop_indices_found = []
for end_match in matching_ends_direct:
groups = end_match.groups()
# groups: (module, testName, funcName, loopIndex, invocationId, durationNs)
func_name = groups[2]
loop_index = int(groups[3]) if groups[3].isdigit() else 1
loop_indices_found.append(loop_index)
line_id = groups[4]
try:
runtime = int(groups[5])
@@ -403,6 +451,12 @@ def parse_jest_test_xml(
stdout="",
)
)
if loop_indices_found:
logger.debug(
f"[LOOP-DEBUG] Testcase '{test_name}': processed {len(matching_ends_direct)} END markers, "
f"loop_index range: {min(loop_indices_found)}-{max(loop_indices_found)}, "
f"total results so far: {len(test_results.test_results)}"
)
else:
# Process each timing marker
for match in matching_starts:
@@ -454,5 +508,19 @@ def parse_jest_test_xml(
f"Jest XML parsing complete: {len(test_results.test_results)} results "
f"from {suite_count} suites, {testcase_count} testcases"
)
# Debug: show loop_index distribution for perf analysis
if test_results.test_results:
loop_indices = [r.loop_index for r in test_results.test_results]
unique_loop_indices = sorted(set(loop_indices))
min_idx, max_idx = min(unique_loop_indices), max(unique_loop_indices)
logger.debug(
f"[LOOP-SUMMARY] Results loop_index: min={min_idx}, max={max_idx}, "
f"unique_count={len(unique_loop_indices)}, total_results={len(loop_indices)}"
)
if max_idx == 1 and len(loop_indices) > 1:
logger.warning(
f"[LOOP-WARNING] All {len(loop_indices)} results have loop_index=1. "
"Perf test markers may not have been parsed correctly."
)
return test_results

View file

@@ -2134,6 +2134,7 @@ class JavaScriptSupport:
from codeflash.languages.test_framework import get_js_test_framework_or_default
framework = test_framework or get_js_test_framework_or_default()
logger.debug(f"run_benchmarking_tests called with framework={framework}")
# Use JS-specific high max_loops - actual loop count is limited by target_duration
effective_max_loops = self.JS_BENCHMARKING_MAX_LOOPS
@@ -2141,6 +2142,7 @@
if framework == "vitest":
from codeflash.languages.javascript.vitest_runner import run_vitest_benchmarking_tests
logger.debug("Dispatching to run_vitest_benchmarking_tests")
return run_vitest_benchmarking_tests(
test_paths=test_paths,
test_env=test_env,
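The comment above describes duration-bounded looping: JS_BENCHMARKING_MAX_LOOPS is a deliberately high ceiling, and the effective bound is target_duration. A minimal sketch of that scheme; the function and parameter names are illustrative, not the project's API:

import time

def run_loops(run_once, max_loops=100_000, target_duration_ms=5_000):
    deadline = time.monotonic() + target_duration_ms / 1000
    loops = 0
    while loops < max_loops and time.monotonic() < deadline:
        run_once()
        loops += 1
    return loops  # in practice the deadline, not the ceiling, ends the loop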

View file

@@ -520,6 +520,9 @@ def run_vitest_benchmarking_tests(
) -> tuple[Path, subprocess.CompletedProcess]:
"""Run Vitest benchmarking tests with external looping from Python.
NOTE: This function MUST use benchmarking_file_path (perf tests with capturePerf),
NOT instrumented_behavior_file_path (behavior tests with capture).
Uses external process-level looping to run tests multiple times and
collect timing data. This matches the Python pytest approach where
looping is controlled externally for simplicity.
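A minimal sketch of the external process-level looping the docstring describes, with each pass tagged by its loop index. The command shape and the way CODEFLASH_LOOP_INDEX is consumed are assumptions for illustration:

import os
import subprocess

def run_external_loops(cmd, n_loops):
    for loop_index in range(1, n_loops + 1):
        env = {**os.environ, "CODEFLASH_LOOP_INDEX": str(loop_index)}
        subprocess.run(cmd, env=env, check=False)  # one full test process per loop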
@@ -544,6 +547,22 @@ def run_vitest_benchmarking_tests(
# Get performance test files
test_files = [Path(file.benchmarking_file_path) for file in test_paths.test_files if file.benchmarking_file_path]
# Log test file selection
total_test_files = len(test_paths.test_files)
perf_test_files = len(test_files)
logger.debug(f"Vitest benchmark test file selection: {perf_test_files}/{total_test_files} have benchmarking_file_path")
if perf_test_files == 0:
logger.warning("No perf test files found! Cannot run benchmarking tests.")
for tf in test_paths.test_files:
logger.warning(f"Test file: behavior={tf.instrumented_behavior_file_path}, perf={tf.benchmarking_file_path}")
elif perf_test_files < total_test_files:
for tf in test_paths.test_files:
if not tf.benchmarking_file_path:
logger.warning(f"Missing benchmarking_file_path: behavior={tf.instrumented_behavior_file_path}")
else:
for tf in test_files[:3]: # Log first 3 perf test files
logger.debug(f"Using perf test file: {tf}")
# Use provided project_root, or detect it as fallback
if project_root is None and test_files:
project_root = _find_vitest_project_root(test_files[0])
@@ -574,14 +593,21 @@
vitest_env["CODEFLASH_PERF_STABILITY_CHECK"] = "true" if stability_check else "false"
vitest_env["CODEFLASH_LOOP_INDEX"] = "1"
# Set test module for marker identification (use first test file as reference)
if test_files:
test_module_path = str(test_files[0].relative_to(effective_cwd) if test_files[0].is_relative_to(effective_cwd) else test_files[0].name)
vitest_env["CODEFLASH_TEST_MODULE"] = test_module_path
logger.debug(f"[VITEST-BENCH] Set CODEFLASH_TEST_MODULE={test_module_path}")
# Total timeout for the entire benchmark run
total_timeout = max(120, (target_duration_ms // 1000) + 60, timeout or 120)
logger.debug(f"[VITEST-BENCH] Running Vitest benchmarking tests: {' '.join(vitest_cmd)}")
logger.debug(
f"[VITEST-BENCH] Config: min_loops={min_loops}, max_loops={max_loops}, "
f"target_duration={target_duration_ms}ms, stability_check={stability_check}"
)
logger.debug(f"[VITEST-BENCH] Environment: CODEFLASH_PERF_LOOP_COUNT={vitest_env.get('CODEFLASH_PERF_LOOP_COUNT')}")
total_start_time = time.time()
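Worked examples of the total_timeout rule above: the run gets at least 120 s, at least the target duration plus a 60 s buffer, and never less than an explicit timeout.

def total_timeout(target_duration_ms, timeout=None):
    return max(120, (target_duration_ms // 1000) + 60, timeout or 120)

print(total_timeout(5_000))  # 120 (the floor wins)
print(total_timeout(300_000))  # 360 (300 s target + 60 s buffer)
print(total_timeout(5_000, 600))  # 600 (the explicit timeout wins)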
@@ -606,7 +632,26 @@
result = subprocess.CompletedProcess(args=vitest_cmd, returncode=-1, stdout="", stderr="Vitest not found")
wall_clock_seconds = time.time() - total_start_time
logger.debug(f"[VITEST-BENCH] Completed in {wall_clock_seconds:.2f}s, returncode={result.returncode}")
# Debug: Check for END markers with duration (perf test format)
if result.stdout:
import re
perf_end_pattern = re.compile(r"!######[^:]+:[^:]+:[^:]+:(\d+):[^:]+:(\d+)######!")
perf_matches = list(perf_end_pattern.finditer(result.stdout))
if perf_matches:
loop_indices = [int(m.group(1)) for m in perf_matches]
logger.debug(
f"[VITEST-BENCH] Found {len(perf_matches)} perf END markers in stdout, "
f"loop_index range: {min(loop_indices)}-{max(loop_indices)}"
)
else:
logger.debug(f"[VITEST-BENCH] No perf END markers found in stdout (len={len(result.stdout)})")
# Check if there are behavior END markers instead
behavior_end_pattern = re.compile(r"!######[^:]+:[^:]+:[^:]+:\d+:[^#]+######!")
behavior_matches = list(behavior_end_pattern.finditer(result.stdout))
if behavior_matches:
logger.debug(f"[VITEST-BENCH] Found {len(behavior_matches)} behavior END markers instead (no duration)")
return result_file_path, result
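To make the two marker shapes checked above concrete, a small sketch with hypothetical sample strings: a perf END marker ends in a numeric duration field, while a behavior END marker has no trailing duration.

import re

perf_end = re.compile(r"!######[^:]+:[^:]+:[^:]+:(\d+):[^:]+:(\d+)######!")
behavior_end = re.compile(r"!######[^:]+:[^:]+:[^:]+:\d+:[^#]+######!")

perf_line = "!######mod:test:fn:2:0_1:53100######!"  # hypothetical
behavior_line = "!######mod:test:fn:1:0_1######!"  # hypothetical
print(bool(perf_end.search(perf_line)), bool(behavior_end.search(behavior_line)))  # True True
print(bool(perf_end.search(behavior_line)))  # False: no duration field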

View file

@@ -2363,6 +2363,10 @@
)
console.rule()
with progress_bar("Running performance benchmarks..."):
logger.debug(f"[BENCHMARK-START] Starting benchmarking tests with {len(self.test_files.test_files)} test files")
for idx, tf in enumerate(self.test_files.test_files):
logger.debug(f"[BENCHMARK-FILES] Test file {idx}: perf_file={tf.benchmarking_file_path}")
if self.function_to_optimize.is_async and is_python():
from codeflash.code_utils.instrument_existing_tests import add_async_decorator_to_function
@@ -2380,6 +2384,7 @@
enable_coverage=False,
code_context=code_context,
)
logger.debug(f"[BENCHMARK-DONE] Got {len(benchmarking_results.test_results)} benchmark results")
finally:
if self.function_to_optimize.is_async:
self.write_code_and_helpers(

View file

@@ -325,6 +325,7 @@ def run_benchmarking_tests(
pytest_max_loops: int = 100_000,
js_project_root: Path | None = None,
) -> tuple[Path, subprocess.CompletedProcess]:
logger.debug(f"run_benchmarking_tests called: framework={test_framework}, num_files={len(test_paths.test_files)}")
# Check if there's a language support for this test framework that implements run_benchmarking_tests
language_support = get_language_support_by_framework(test_framework)
if language_support is not None and hasattr(language_support, "run_benchmarking_tests"):
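For context, the delegation above in a self-contained sketch. The stub class and return values are illustrative only; the get_language_support_by_framework lookup and the hasattr check mirror the diff:

class JavaScriptSupport:  # stub standing in for the real language support
    def run_benchmarking_tests(self, **kwargs):
        return "js-benchmark-results"

def get_language_support_by_framework(framework):
    return JavaScriptSupport() if framework in ("jest", "vitest") else None

def run_benchmarking_tests(test_framework, **kwargs):
    support = get_language_support_by_framework(test_framework)
    if support is not None and hasattr(support, "run_benchmarking_tests"):
        return support.run_benchmarking_tests(**kwargs)  # delegate to language support
    return "default-pytest-results"  # otherwise fall through to the pytest path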

View file

@@ -839,7 +839,7 @@ function setTestName(name) {
resetInvocationCounters();
}
// Jest/Vitest lifecycle hooks - these run automatically when this module is imported
if (typeof beforeEach !== 'undefined') {
beforeEach(() => {
// Get current test name and path from Jest's expect state
@@ -854,6 +854,17 @@ if (typeof beforeEach !== 'undefined') {
}
// Reset invocation counters for each test
resetInvocationCounters();
// For Vitest (no external loop-runner), reset perf state for each test
// so each test gets its own time budget for internal looping.
// For Jest with loop-runner, CODEFLASH_PERF_CURRENT_BATCH is set,
// and we want shared state across the test file.
const hasExternalLoopRunner = process.env.CODEFLASH_PERF_CURRENT_BATCH !== undefined;
if (!hasExternalLoopRunner) {
resetPerfState();
// Also reset invocation loop counts so each test starts fresh
sharedPerfState.invocationLoopCounts = {};
}
});
}