Fix pytest 9 compat, addopts conflicts, XML recovery, and diagnostics

- Use getattr for rootdir/rootpath in discovery worker (pytest 9 compat)
- Add -o addopts= to all pytest invocations to override project config
- Extract _base_pytest_args helper to eliminate duplication across runners
- Support [tool.pytest] config section (not just [tool.pytest.ini_options])
- Add --dist, --no-flaky-report, --failed-first to BLACKLIST_ADDOPTS
- Add recover=True to XMLParser for malformed JUnit XML tolerance
- Log subprocess stdout/stderr on baseline and candidate test failures
- Friendly warning when GitHub App not installed instead of raw error
- Upgrade repair failure logging from debug to warning
This commit is contained in:
Kevin Turcios 2026-04-21 20:41:43 -05:00
parent 17de71251f
commit cefd625d35
8 changed files with 61 additions and 24 deletions

View file

@ -57,7 +57,7 @@ class PytestCollectionPlugin:
global pytest_rootdir, collected_tests
collected_tests.extend(session.items)
pytest_rootdir = session.config.rootdir
pytest_rootdir = getattr(session.config, "rootdir", None) or getattr(session.config, "rootpath", None)
# Write results immediately since pytest.main() will exit after
# this callback, not always with a success code.
@ -87,6 +87,7 @@ if __name__ == "__main__":
tests_root,
"-p",
"no:logging",
"-o", "addopts=",
"--collect-only",
"-m",
"not skip",

View file

@ -149,6 +149,13 @@ def check_create_pr( # noqa: PLR0913
pr_id,
pr_url,
)
elif response.status_code == 401 and "not installed" in response.text: # noqa: PLR2004
log.warning(
"Could not create PR: the Codeflash GitHub App is not installed on %s/%s. "
"Install it at https://github.com/apps/codeflash-ai to enable automatic PRs.",
owner,
repo,
)
else:
log.error(
"Failed to create PR: %s",

View file

@ -172,6 +172,14 @@ def run_tests_and_benchmark( # noqa: PLR0913
cid,
len(diffs),
)
if not diffs and run_result is not None and run_result.returncode != 0:
log.warning(
"Candidate %s test subprocess crashed (exit code %s).\nSTDOUT: %.2000s\nSTDERR: %.2000s",
cid,
run_result.returncode,
run_result.stdout,
run_result.stderr,
)
eval_ctx.record_failed(cid)
# Store diffs for potential code repair.
if diffs:

View file

@ -332,7 +332,7 @@ def repair_failed_candidates(
try:
result = code_repair(ai_client, request)
except Exception: # noqa: BLE001
log.debug(
log.warning(
"Repair failed for candidate %s",
cid,
exc_info=True,

View file

@ -22,6 +22,9 @@ BLACKLIST_ADDOPTS: tuple[str, ...] = (
"--cov",
"--profile",
"--junitxml",
"--dist",
"--no-flaky-report",
"--failed-first",
"-n",
)
@ -96,14 +99,14 @@ def modify_addopts(
try:
if filename == "pyproject.toml":
data = tomlkit.parse(content)
pytest_section = data.get("tool", {}).get("pytest", {})
original_addopts = (
data.get("tool", {})
.get("pytest", {})
.get("ini_options", {})
.get("addopts", "")
pytest_section.get("ini_options", {}).get("addopts", "")
or pytest_section.get("addopts", "")
)
if original_addopts == "":
return content, False
uses_ini_options = "ini_options" in pytest_section and "addopts" in pytest_section.get("ini_options", {})
if isinstance(original_addopts, list):
original_addopts = " ".join(original_addopts)
original_addopts = original_addopts.replace("=", " ")
@ -134,9 +137,14 @@ def modify_addopts(
if new_addopts_args == addopts_args:
return content, False
if file_type == ".toml":
if uses_ini_options:
data["tool"]["pytest"]["ini_options"]["addopts"] = ( # type: ignore[index]
" ".join(new_addopts_args)
)
else:
data["tool"]["pytest"]["addopts"] = ( # type: ignore[index]
" ".join(new_addopts_args)
)
with config_file.open("w", encoding="utf-8") as f:
f.write(tomlkit.dumps(data))
return content, True

View file

@ -22,6 +22,16 @@ _PER_FILE_TIMEOUT = 60
_MAX_TIMEOUT = 600
def _base_pytest_args(rootdir: Path | None, cwd: Path) -> list[str]:
"""Common pytest args shared across all test runner functions."""
return [
"--capture=tee-sys",
"-q",
f"--rootdir={rootdir or cwd}",
"-o", "addopts=",
]
def _subprocess_timeout(num_test_files: int) -> int:
    """Compute the subprocess timeout from the number of test files.

    Scales linearly from a base timeout, capped at ``_MAX_TIMEOUT`` so a
    huge test suite cannot make the runner wait indefinitely.

    Args:
        num_test_files: Number of test files the subprocess will execute.

    Returns:
        Timeout in seconds, never exceeding ``_MAX_TIMEOUT``.
    """
    scaled = _BASE_TIMEOUT + num_test_files * _PER_FILE_TIMEOUT
    return scaled if scaled < _MAX_TIMEOUT else _MAX_TIMEOUT
@ -105,9 +115,7 @@ def run_behavioral_tests( # noqa: PLR0913
*shlex.split(pytest_cmd),
]
common_args = [
"--capture=tee-sys",
"-q",
f"--rootdir={rootdir or cwd}",
*_base_pytest_args(rootdir, cwd),
"--codeflash_loops_scope=session",
"--codeflash_min_loops=1",
"--codeflash_max_loops=1",
@ -226,9 +234,7 @@ def run_benchmarking_tests( # noqa: PLR0913
)
pytest_args = [
"--capture=tee-sys",
"-q",
f"--rootdir={rootdir or cwd}",
*_base_pytest_args(rootdir, cwd),
"--codeflash_loops_scope=session",
f"--codeflash_min_loops={min_loops}",
f"--codeflash_max_loops={max_loops}",
@ -299,9 +305,7 @@ def run_line_profile_tests( # noqa: PLR0913
)
pytest_args = [
"--capture=tee-sys",
"-q",
f"--rootdir={rootdir or cwd}",
*_base_pytest_args(rootdir, cwd),
"--codeflash_loops_scope=session",
"--codeflash_min_loops=1",
"--codeflash_max_loops=1",

View file

@ -48,13 +48,13 @@ matches_re_end = re.compile(
def _parse_func(file_path):  # type: ignore[no-untyped-def]
    """XML parser with huge_tree and recover to handle large/malformed JUnit XML.

    The diff rendering interleaved the removed and added lines (two
    docstrings, two ``XMLParser`` constructions); this is the coherent
    post-change version: ``huge_tree=True`` lifts lxml's default size
    limits for very large JUnit files, and ``recover=True`` tolerates
    malformed XML (e.g. truncated output from a crashed test subprocess).

    Args:
        file_path: Path (or file-like object) of the JUnit XML report.

    Returns:
        The parsed lxml ElementTree for ``file_path``.
    """
    from lxml.etree import (  # type: ignore[import-untyped]  # noqa: PLC0415
        XMLParser,
        parse,
    )

    xml_parser = XMLParser(huge_tree=True, recover=True)
    return parse(file_path, xml_parser)

View file

@ -236,6 +236,15 @@ def establish_original_code_baseline( # noqa: PLR0913
)
if not behavioral_results:
if run_result is not None:
log.warning(
"No behavioral test results for original code (exit code %s). "
"Skipping baseline establishment.\nSTDOUT: %.2000s\nSTDERR: %.2000s",
run_result.returncode,
run_result.stdout,
run_result.stderr,
)
else:
log.warning(
"No behavioral test results for original code. "
"Skipping baseline establishment.",