fix(java): recover from Maven compilation failures in generated tests

When codeflash generates Java regression tests that fail to compile (e.g.
because the LLM used an unavailable Mockito dependency or incorrect API
calls), the entire test run was previously aborted with "no tests to run",
preventing any baseline from being established for any function in an
--all run.

This commit adds three new helper functions to test_runner.py:
- _extract_failing_java_files(): parses Maven compilation error output
  to extract the file paths of files with errors
- _remove_failing_generated_tests(): deletes codeflash-generated files
  (__perfinstrumented / __perfonlyinstrumented) that have errors; never
  touches original project test files
- _filter_test_paths_excluding_files(): returns a copy of a TestFiles
  object (or list/tuple of paths) with the removed files excluded

These helpers are wired into both _run_maven_tests() and
_run_direct_or_fallback_maven() to automatically remove offending
generated tests and retry, allowing the pre-existing instrumented tests
to still run and establish a baseline.

Also changes the Maven compilation error log from truncated (50 lines)
to full output so failures are easier to diagnose.

Adds 22 new unit tests in test_compilation_failure_recovery.py covering
all three helper functions.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
misrasaurabh1 2026-03-03 04:09:08 +00:00
parent 96b99ea85d
commit 6f533e8253
3 changed files with 425 additions and 14 deletions

View file

@ -47,6 +47,10 @@ _classpath_cache: dict[tuple[Path, str | None], str] = {}
# Allows: letters, digits, underscores, dots, and dollar signs (inner classes)
_VALID_JAVA_CLASS_NAME = re.compile(r"^[a-zA-Z_$][a-zA-Z0-9_$.]*$")
# Regex to extract Java file paths from Maven compilation error output.
# Maven reports errors like: [ERROR] /path/to/File.java:[line,col] message
# NOTE(review): only absolute, space-free Unix-style paths (leading '/') match;
# Windows drive paths (C:\...) would be skipped — presumably fine, TODO confirm.
_MAVEN_COMPILATION_FILE_RE = re.compile(r"\[ERROR\]\s+(/[^\s\[]+\.java):\[\d+,\d+\]")
def _run_cmd_kill_pg_on_timeout(
cmd: list[str],
@ -266,6 +270,90 @@ def _extract_modules_from_pom_content(content: str) -> list[str]:
return [m.text for m in modules_elem if m.text]
def _extract_failing_java_files(output: str) -> set[Path]:
    """Collect the Java source files named in Maven compilation errors.

    Scans the combined Maven log for lines of the form
    ``[ERROR] /path/to/File.java:[line,col] message`` and gathers the
    referenced file paths (deduplicated).

    Args:
        output: Combined stdout+stderr from a Maven run.

    Returns:
        Set of Path objects for files that have compilation errors.
    """
    # One C-level regex pass over the whole log; duplicates collapse in the set.
    return {Path(m.group(1)) for m in _MAVEN_COMPILATION_FILE_RE.finditer(output)}
def _remove_failing_generated_tests(failing_files: set[Path]) -> list[Path]:
"""Remove generated test files that have compilation errors.
Only removes files whose names contain '__perfinstrumented' or
'__perfonlyinstrumented' (i.e., files written by codeflash).
Original source files and manually written tests are never removed.
Args:
failing_files: Set of file paths that have compilation errors.
Returns:
List of paths that were actually removed.
"""
removed: list[Path] = []
for path in failing_files:
if "__perfinstrumented" in path.name or "__perfonlyinstrumented" in path.name:
if path.exists():
path.unlink()
removed.append(path)
logger.info("Removed failing generated test file: %s", path)
return removed
def _filter_test_paths_excluding_files(test_paths: Any, removed_files: list[Path]) -> Any:
"""Return a copy of test_paths with the given files excluded.
Filters out TestFile entries whose instrumented_behavior_file_path or
benchmarking_file_path matches any of the removed files.
Args:
test_paths: TestFiles object or list/tuple of paths.
removed_files: Paths that were removed from disk.
Returns:
Filtered test_paths of the same type.
"""
if not removed_files:
return test_paths
removed_set = {p.resolve() for p in removed_files}
if hasattr(test_paths, "test_files"):
from codeflash.models.models import TestFiles
kept = []
for tf in test_paths.test_files:
behavior_removed = (
tf.instrumented_behavior_file_path is not None
and tf.instrumented_behavior_file_path.resolve() in removed_set
)
bench_removed = (
tf.benchmarking_file_path is not None and tf.benchmarking_file_path.resolve() in removed_set
)
if not behavior_removed and not bench_removed:
kept.append(tf)
return TestFiles(test_files=kept)
if isinstance(test_paths, (list, tuple)):
kept_paths = [p for p in test_paths if not (isinstance(p, Path) and p.resolve() in removed_set)]
return type(test_paths)(kept_paths)
return test_paths
def _validate_test_filter(test_filter: str) -> str:
"""Validate and sanitize a test filter string for Maven.
@ -972,12 +1060,39 @@ def _run_direct_or_fallback_maven(
logger.debug("Step 1: Compiling tests for %s mode", mode)
compile_result = _compile_tests(maven_root, run_env, test_module, timeout=120)
if compile_result.returncode != 0:
logger.warning("Compilation failed (rc=%d), falling back to Maven-based execution", compile_result.returncode)
result = _run_maven_tests(maven_root, test_paths, run_env, timeout=timeout, mode=mode, test_module=test_module)
target_dir = _get_test_module_target_dir(maven_root, test_module)
surefire_dir = target_dir / "surefire-reports"
result_xml_path = _get_combined_junit_xml(surefire_dir, candidate_index)
return result, result_xml_path
# Recovery: if compilation failed, check whether codeflash-generated test files are
# to blame (e.g. LLM generated tests using Mockito or incorrect API calls).
# Remove the offending files and retry compilation so that pre-existing tests can
# still run and establish a baseline.
combined_compile_output = (compile_result.stdout or "") + (compile_result.stderr or "")
failing_files = _extract_failing_java_files(combined_compile_output)
removed = _remove_failing_generated_tests(failing_files)
if removed:
logger.info(
"Removed %d failing generated test file(s) before retry: %s",
len(removed),
[str(r) for r in removed],
)
test_paths = _filter_test_paths_excluding_files(test_paths, removed)
compile_result = _compile_tests(maven_root, run_env, test_module, timeout=120)
if compile_result.returncode != 0:
logger.warning(
"Compilation failed (rc=%d), falling back to Maven-based execution",
compile_result.returncode,
)
result = _run_maven_tests(maven_root, test_paths, run_env, timeout=timeout, mode=mode, test_module=test_module)
target_dir = _get_test_module_target_dir(maven_root, test_module)
surefire_dir = target_dir / "surefire-reports"
result_xml_path = _get_combined_junit_xml(surefire_dir, candidate_index)
return result, result_xml_path
# Re-extract test classes after filtering
test_classes = _get_test_class_names(test_paths, mode=mode)
if not test_classes:
logger.warning("No valid test classes remain after removing failing generated tests")
result_xml_path, empty_result = _get_empty_result(maven_root, test_module)
return empty_result, result_xml_path
# Step 2: Get classpath (cached after first call)
logger.debug("Step 2: Getting classpath")
@ -1617,10 +1732,39 @@ def _run_maven_tests(
mode,
result.returncode,
)
# Log first 50 lines of output to help diagnose compilation errors
output_lines = combined_output.split("\n")
error_context = "\n".join(output_lines[:50]) if len(output_lines) > 50 else combined_output
logger.error("Maven compilation error output:\n%s", error_context)
# Log full error output for diagnostics
logger.error("Maven compilation error output:\n%s", combined_output)
# Recovery: remove failing generated test files and retry.
# LLM-generated tests may use unavailable dependencies (e.g. Mockito)
# or incorrect API calls that prevent compilation. Removing them allows
# existing (pre-instrumented) tests to still run and establish a baseline.
failing_files = _extract_failing_java_files(combined_output)
removed = _remove_failing_generated_tests(failing_files)
if removed:
logger.info(
"Removed %d failing generated test file(s), retrying Maven run: %s",
len(removed),
[str(r) for r in removed],
)
# Rebuild test filter without the removed files
filtered_paths = _filter_test_paths_excluding_files(test_paths, removed)
retry_filter = _build_test_filter(filtered_paths, mode=mode)
if retry_filter:
retry_cmd = [c for c in cmd if not c.startswith("-Dtest=")]
retry_cmd.append(f"-Dtest={_validate_test_filter(retry_filter)}")
result = _run_cmd_kill_pg_on_timeout(
retry_cmd, cwd=project_root, env=env, timeout=timeout
)
if result.returncode != 0:
retry_output = (result.stdout or "") + (result.stderr or "")
logger.error(
"Maven still failed after removing %d generated test files: %s",
len(removed),
retry_output[:500],
)
else:
logger.warning("No valid tests remain after removing failing generated tests")
return result

View file

@ -0,0 +1,267 @@
"""Tests for compilation failure recovery helpers in the Java test runner.
These helpers parse Maven error output to detect failing generated test files,
remove them from disk, and filter them out of test_paths objects, enabling
a retry without the problematic files.
"""
from __future__ import annotations
from pathlib import Path
from codeflash.languages.java.test_runner import (
_extract_failing_java_files,
_filter_test_paths_excluding_files,
_remove_failing_generated_tests,
)
# ---------------------------------------------------------------------------
# _extract_failing_java_files
# ---------------------------------------------------------------------------
class TestExtractFailingJavaFiles:
    """Unit tests for _extract_failing_java_files."""

    def test_empty_output(self) -> None:
        """An empty Maven log yields an empty set."""
        assert _extract_failing_java_files("") == set()

    def test_single_error(self) -> None:
        """A single compilation error line is parsed into one path."""
        log = (
            "[ERROR] /home/user/project/test/src/FooTest__perfinstrumented.java:[42,13] "
            "cannot find symbol\n"
        )
        expected = Path("/home/user/project/test/src/FooTest__perfinstrumented.java")
        assert _extract_failing_java_files(log) == {expected}

    def test_multiple_errors_same_file(self) -> None:
        """Several errors in one file collapse into a single entry."""
        log = (
            "[ERROR] /home/user/Foo__perfinstrumented.java:[10,5] error: cannot find symbol\n"
            "[ERROR] /home/user/Foo__perfinstrumented.java:[20,5] error: package x does not exist\n"
        )
        assert _extract_failing_java_files(log) == {Path("/home/user/Foo__perfinstrumented.java")}

    def test_multiple_different_files(self) -> None:
        """Every distinct failing file shows up in the result."""
        lines = [
            "[ERROR] /a/Foo__perfinstrumented.java:[1,1] bad stuff\n",
            "[ERROR] /b/Bar__perfonlyinstrumented.java:[2,2] bad stuff\n",
            "[ERROR] /c/Baz__perfinstrumented.java:[3,3] bad stuff\n",
        ]
        expected = {
            Path("/a/Foo__perfinstrumented.java"),
            Path("/b/Bar__perfonlyinstrumented.java"),
            Path("/c/Baz__perfinstrumented.java"),
        }
        assert _extract_failing_java_files("".join(lines)) == expected

    def test_non_java_errors_ignored(self) -> None:
        """[ERROR] lines without a Java file location are skipped."""
        log = (
            "[ERROR] Some other Maven error\n"
            "[ERROR] BUILD FAILURE\n"
            "[INFO] This is not an error\n"
        )
        assert _extract_failing_java_files(log) == set()

    def test_ignores_non_generated_files(self) -> None:
        """Extraction reports every failing file; filtering happens later."""
        log = "[ERROR] /src/main/java/Regular.java:[5,3] error\n"
        # _extract returns ALL failing files; _remove only deletes generated ones
        assert _extract_failing_java_files(log) == {Path("/src/main/java/Regular.java")}

    def test_column_zero(self) -> None:
        """A [0,0] location still matches the error pattern."""
        log = "[ERROR] /home/user/Test__perfinstrumented.java:[0,0] error\n"
        assert _extract_failing_java_files(log) == {Path("/home/user/Test__perfinstrumented.java")}

    def test_path_with_spaces_not_matched(self) -> None:
        """A path containing spaces must not crash parsing (regex stops at whitespace)."""
        log = "[ERROR] /home/user/path with spaces/Foo.java:[1,1] error\n"
        # Only part of the path would match, which is acceptable by design;
        # the important property is that parsing does not raise.
        assert isinstance(_extract_failing_java_files(log), set)
# ---------------------------------------------------------------------------
# _remove_failing_generated_tests
# ---------------------------------------------------------------------------
class TestRemoveFailingGeneratedTests:
    """Unit tests for _remove_failing_generated_tests."""

    def test_empty_set(self) -> None:
        """Nothing to remove when given no files."""
        assert _remove_failing_generated_tests(set()) == []

    def test_removes_perfinstrumented_file(self, tmp_path: Path) -> None:
        """A __perfinstrumented file on disk is deleted and reported."""
        target = tmp_path / "Foo__perfinstrumented.java"
        target.write_text("// generated\n")
        assert _remove_failing_generated_tests({target}) == [target]
        assert not target.exists()

    def test_removes_perfonlyinstrumented_file(self, tmp_path: Path) -> None:
        """A __perfonlyinstrumented file on disk is deleted and reported."""
        target = tmp_path / "Foo__perfonlyinstrumented.java"
        target.write_text("// generated\n")
        assert _remove_failing_generated_tests({target}) == [target]
        assert not target.exists()

    def test_does_not_remove_regular_files(self, tmp_path: Path) -> None:
        """Files without the generated-test markers are left alone."""
        target = tmp_path / "RegularTest.java"
        target.write_text("// real test\n")
        assert _remove_failing_generated_tests({target}) == []
        assert target.exists()

    def test_does_not_remove_nonexistent_files(self, tmp_path: Path) -> None:
        """A generated file that is already gone is silently skipped."""
        ghost = tmp_path / "Ghost__perfinstrumented.java"
        assert not ghost.exists()
        assert _remove_failing_generated_tests({ghost}) == []

    def test_mixed_files(self, tmp_path: Path) -> None:
        """From a mixed set, only the generated file is deleted."""
        generated = tmp_path / "Gen__perfinstrumented.java"
        generated.write_text("// gen\n")
        handwritten = tmp_path / "RealTest.java"
        handwritten.write_text("// real\n")
        removed = _remove_failing_generated_tests({generated, handwritten})
        # Input is a set, so iteration order varies; compare as a set.
        assert set(removed) == {generated}
        assert not generated.exists()
        assert handwritten.exists()
# ---------------------------------------------------------------------------
# _filter_test_paths_excluding_files
# ---------------------------------------------------------------------------
class TestFilterTestPathsExcludingFiles:
    """Unit tests for _filter_test_paths_excluding_files."""

    @staticmethod
    def _make_test_file(behavior, bench):
        """Build a minimal generated-regression TestFile for these tests."""
        from codeflash.models.models import TestFile, TestType

        return TestFile(
            instrumented_behavior_file_path=behavior,
            benchmarking_file_path=bench,
            original_file_path=None,
            original_source="",
            test_type=TestType.GENERATED_REGRESSION,
            tests_in_file=None,
        )

    def test_empty_removed_files_returns_same(self) -> None:
        """With nothing removed, the exact same object comes back."""
        paths = [Path("/a"), Path("/b")]
        assert _filter_test_paths_excluding_files(paths, []) is paths

    def test_filters_list_of_paths(self, tmp_path: Path) -> None:
        """Removed entries disappear from a list of paths."""
        first = tmp_path / "a.java"
        second = tmp_path / "b.java"
        first.write_text("")
        second.write_text("")
        assert _filter_test_paths_excluding_files([first, second], [first]) == [second]

    def test_filters_tuple_of_paths(self, tmp_path: Path) -> None:
        """Tuples are filtered and stay tuples."""
        first = tmp_path / "a.java"
        second = tmp_path / "b.java"
        first.write_text("")
        second.write_text("")
        filtered = _filter_test_paths_excluding_files((first, second), [first])
        assert isinstance(filtered, tuple)
        assert filtered == (second,)

    def test_filters_testfiles_by_behavior_path(self, tmp_path: Path) -> None:
        """A TestFiles entry is dropped when its behavior path was removed."""
        from codeflash.models.models import TestFiles

        behavior = tmp_path / "Foo__perfinstrumented.java"
        bench = tmp_path / "Foo__perfonlyinstrumented.java"
        behavior.write_text("")
        bench.write_text("")
        container = TestFiles(test_files=[self._make_test_file(behavior, bench)])
        filtered = _filter_test_paths_excluding_files(container, [behavior])
        assert len(filtered.test_files) == 0

    def test_filters_testfiles_by_bench_path(self, tmp_path: Path) -> None:
        """A TestFiles entry is dropped when its benchmarking path was removed."""
        from codeflash.models.models import TestFiles

        behavior = tmp_path / "Foo__perfinstrumented.java"
        bench = tmp_path / "Foo__perfonlyinstrumented.java"
        behavior.write_text("")
        bench.write_text("")
        container = TestFiles(test_files=[self._make_test_file(behavior, bench)])
        filtered = _filter_test_paths_excluding_files(container, [bench])
        assert len(filtered.test_files) == 0

    def test_keeps_unaffected_testfiles(self, tmp_path: Path) -> None:
        """Entries whose paths were not removed survive filtering."""
        from codeflash.models.models import TestFiles

        good_behavior = tmp_path / "Good__perfinstrumented.java"
        bad_behavior = tmp_path / "Bad__perfinstrumented.java"
        good_behavior.write_text("")
        bad_behavior.write_text("")
        container = TestFiles(
            test_files=[
                self._make_test_file(good_behavior, None),
                self._make_test_file(bad_behavior, None),
            ]
        )
        filtered = _filter_test_paths_excluding_files(container, [bad_behavior])
        assert len(filtered.test_files) == 1
        assert filtered.test_files[0].instrumented_behavior_file_path == good_behavior

    def test_unknown_type_returned_unchanged(self) -> None:
        """Objects that are neither TestFiles nor list/tuple pass through untouched."""
        mystery = {"some": "dict"}
        assert _filter_test_paths_excluding_files(mystery, [Path("/foo")]) is mystery

    def test_path_resolution_used_for_comparison(self, tmp_path: Path) -> None:
        """Matching is done on resolved paths, not raw string equality."""
        target = tmp_path / "Foo__perfinstrumented.java"
        target.write_text("")
        # Pass a relative path equivalent (resolved in _filter by .resolve())
        assert _filter_test_paths_excluding_files([target], [target.resolve()]) == []

View file

@ -4,9 +4,9 @@ from pathlib import Path
import pytest
from codeflash.languages.base import Language, LanguageSupport
from codeflash.languages.java.support import JavaSupport, get_java_support
from codeflash.discovery.functions_to_optimize import get_files_for_language
from codeflash.languages.base import Language, LanguageSupport
from codeflash.languages.java.support import get_java_support
class TestJavaSupportProtocol:
@ -143,11 +143,11 @@ class TestJavaDirExcludes:
return get_java_support()
def test_dir_excludes_contains_apidocs(self, support):
"""apidocs (generated Javadoc HTML) must not be walked during --all."""
"""Apidocs (generated Javadoc HTML) must not be walked during --all."""
assert "apidocs" in support.dir_excludes
def test_dir_excludes_contains_javadoc(self, support):
"""javadoc directory must not be walked during --all."""
"""Javadoc directory must not be walked during --all."""
assert "javadoc" in support.dir_excludes
def test_apidocs_js_files_not_discovered(self, tmp_path: Path):