From cefd625d353be3f2283a53d7df61728ece0116ba Mon Sep 17 00:00:00 2001
From: Kevin Turcios <turcioskevinr@gmail.com>
Date: Tue, 21 Apr 2026 20:41:43 -0500
Subject: [PATCH] Fix pytest 9 compat, addopts conflicts, XML recovery, and
 diagnostics

- Read config.rootdir via getattr, falling back to config.rootpath, in the
  discovery worker (pytest 9 compat)
- Add -o addopts= to all pytest invocations to override project config
- Extract _base_pytest_args helper to eliminate duplication across runners
- Support [tool.pytest] config section (not just [tool.pytest.ini_options])
- Add --dist, --no-flaky-report, --failed-first to BLACKLIST_ADDOPTS
- Pass recover=True to XMLParser to tolerate malformed JUnit XML
- Log subprocess exit code and stdout/stderr (each truncated to 2000 chars)
  on baseline and candidate test failures
- Log a friendly warning instead of a raw error when the GitHub App is not
  installed
- Upgrade repair failure logging from debug to warning
---
 .../analysis/_discovery_worker.py             |  3 ++-
 .../codeflash_python/codegen/_create_pr.py    |  7 ++++++
 .../pipeline/_candidate_eval.py               |  8 +++++++
 .../pipeline/_candidate_gen.py                |  2 +-
 .../testing/_pytest_config.py                 | 22 +++++++++++++------
 .../codeflash_python/testing/_test_runner.py  | 22 +++++++++++--------
 .../codeflash_python/testing/_xml_parser.py   |  4 ++--
 .../verification/_baseline.py                 | 17 ++++++++++----
 8 files changed, 61 insertions(+), 24 deletions(-)

diff --git a/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py b/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py
index 4ec4d06..74a6af1 100644
--- a/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py
+++ b/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py
@@ -57,7 +57,7 @@ class PytestCollectionPlugin:
         global pytest_rootdir, collected_tests
 
         collected_tests.extend(session.items)
-        pytest_rootdir = session.config.rootdir
+        pytest_rootdir = getattr(session.config, "rootdir", None) or getattr(session.config, "rootpath", None)
 
         # Write results immediately since pytest.main() will exit after
         # this callback, not always with a success code.
@@ -87,6 +87,7 @@ if __name__ == "__main__":
                 tests_root,
                 "-p",
                 "no:logging",
+                "-o", "addopts=",
                 "--collect-only",
                 "-m",
                 "not skip",
diff --git a/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py b/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py
index 2541416..0809961 100644
--- a/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py
+++ b/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py
@@ -149,6 +149,13 @@ def check_create_pr(  # noqa: PLR0913
                 pr_id,
                 pr_url,
             )
+        elif response.status_code == 401 and "not installed" in response.text:  # noqa: PLR2004
+            log.warning(
+                "Could not create PR: the Codeflash GitHub App is not installed on %s/%s. "
+                "Install it at https://github.com/apps/codeflash-ai to enable automatic PRs.",
+                owner,
+                repo,
+            )
         else:
             log.error(
                 "Failed to create PR: %s",
diff --git a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py
index d8be475..b027e9e 100644
--- a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py
+++ b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py
@@ -172,6 +172,14 @@ def run_tests_and_benchmark(  # noqa: PLR0913
             cid,
             len(diffs),
         )
+        if not diffs and run_result is not None and run_result.returncode != 0:
+            log.warning(
+                "Candidate %s test subprocess crashed (exit code %s).\nSTDOUT: %.2000s\nSTDERR: %.2000s",
+                cid,
+                run_result.returncode,
+                run_result.stdout,
+                run_result.stderr,
+            )
         eval_ctx.record_failed(cid)
         # Store diffs for potential code repair.
         if diffs:
diff --git a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py
index 6814441..f2766e8 100644
--- a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py
+++ b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py
@@ -332,7 +332,7 @@ def repair_failed_candidates(
         try:
             result = code_repair(ai_client, request)
         except Exception:  # noqa: BLE001
-            log.debug(
+            log.warning(
                 "Repair failed for candidate %s",
                 cid,
                 exc_info=True,
diff --git a/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py b/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py
index db08f64..7cf318c 100644
--- a/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py
+++ b/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py
@@ -22,6 +22,9 @@ BLACKLIST_ADDOPTS: tuple[str, ...] = (
     "--cov",
     "--profile",
     "--junitxml",
+    "--dist",
+    "--no-flaky-report",
+    "--failed-first",
     "-n",
 )
 
@@ -96,14 +99,14 @@ def modify_addopts(
     try:
         if filename == "pyproject.toml":
             data = tomlkit.parse(content)
+            pytest_section = data.get("tool", {}).get("pytest", {})
             original_addopts = (
-                data.get("tool", {})
-                .get("pytest", {})
-                .get("ini_options", {})
-                .get("addopts", "")
+                pytest_section.get("ini_options", {}).get("addopts", "")
+                or pytest_section.get("addopts", "")
             )
             if original_addopts == "":
                 return content, False
+            uses_ini_options = "ini_options" in pytest_section and "addopts" in pytest_section.get("ini_options", {})
             if isinstance(original_addopts, list):
                 original_addopts = " ".join(original_addopts)
             original_addopts = original_addopts.replace("=", " ")
@@ -134,9 +137,14 @@ def modify_addopts(
         if new_addopts_args == addopts_args:
             return content, False
         if file_type == ".toml":
-            data["tool"]["pytest"]["ini_options"]["addopts"] = (  # type: ignore[index]
-                " ".join(new_addopts_args)
-            )
+            if uses_ini_options:
+                data["tool"]["pytest"]["ini_options"]["addopts"] = (  # type: ignore[index]
+                    " ".join(new_addopts_args)
+                )
+            else:
+                data["tool"]["pytest"]["addopts"] = (  # type: ignore[index]
+                    " ".join(new_addopts_args)
+                )
             with config_file.open("w", encoding="utf-8") as f:
                 f.write(tomlkit.dumps(data))
                 return content, True
diff --git a/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py b/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py
index 258e10a..d19258d 100644
--- a/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py
+++ b/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py
@@ -22,6 +22,16 @@ _PER_FILE_TIMEOUT = 60
 _MAX_TIMEOUT = 600
 
 
+def _base_pytest_args(rootdir: Path | None, cwd: Path) -> list[str]:
+    """Common pytest args shared across all test runner functions."""
+    return [
+        "--capture=tee-sys",
+        "-q",
+        f"--rootdir={rootdir or cwd}",
+        "-o", "addopts=",
+    ]
+
+
 def _subprocess_timeout(num_test_files: int) -> int:
     """Compute subprocess timeout from the number of test files."""
     return min(_BASE_TIMEOUT + _PER_FILE_TIMEOUT * num_test_files, _MAX_TIMEOUT)
@@ -105,9 +115,7 @@ def run_behavioral_tests(  # noqa: PLR0913
         *shlex.split(pytest_cmd),
     ]
     common_args = [
-        "--capture=tee-sys",
-        "-q",
-        f"--rootdir={rootdir or cwd}",
+        *_base_pytest_args(rootdir, cwd),
         "--codeflash_loops_scope=session",
         "--codeflash_min_loops=1",
         "--codeflash_max_loops=1",
@@ -226,9 +234,7 @@ def run_benchmarking_tests(  # noqa: PLR0913
     )
 
     pytest_args = [
-        "--capture=tee-sys",
-        "-q",
-        f"--rootdir={rootdir or cwd}",
+        *_base_pytest_args(rootdir, cwd),
         "--codeflash_loops_scope=session",
         f"--codeflash_min_loops={min_loops}",
         f"--codeflash_max_loops={max_loops}",
@@ -299,9 +305,7 @@ def run_line_profile_tests(  # noqa: PLR0913
     )
 
     pytest_args = [
-        "--capture=tee-sys",
-        "-q",
-        f"--rootdir={rootdir or cwd}",
+        *_base_pytest_args(rootdir, cwd),
         "--codeflash_loops_scope=session",
         "--codeflash_min_loops=1",
         "--codeflash_max_loops=1",
diff --git a/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py b/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py
index 17baa19..e24cf2a 100644
--- a/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py
+++ b/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py
@@ -48,13 +48,13 @@ matches_re_end = re.compile(
 
 
 def _parse_func(file_path):  # type: ignore[no-untyped-def]
-    """XML parser with huge_tree=True to handle large JUnit XML files."""
+    """XML parser with huge_tree and recover to handle large/malformed JUnit XML."""
     from lxml.etree import (  # type: ignore[import-untyped]  # noqa: PLC0415
         XMLParser,
         parse,
     )
 
-    xml_parser = XMLParser(huge_tree=True)
+    xml_parser = XMLParser(huge_tree=True, recover=True)
     return parse(file_path, xml_parser)
 
 
diff --git a/packages/codeflash-python/src/codeflash_python/verification/_baseline.py b/packages/codeflash-python/src/codeflash_python/verification/_baseline.py
index 8057a7c..55d22d2 100644
--- a/packages/codeflash-python/src/codeflash_python/verification/_baseline.py
+++ b/packages/codeflash-python/src/codeflash_python/verification/_baseline.py
@@ -236,10 +236,19 @@ def establish_original_code_baseline(  # noqa: PLR0913
         )
 
     if not behavioral_results:
-        log.warning(
-            "No behavioral test results for original code. "
-            "Skipping baseline establishment.",
-        )
+        if run_result is not None:
+            log.warning(
+                "No behavioral test results for original code (exit code %s). "
+                "Skipping baseline establishment.\nSTDOUT: %.2000s\nSTDERR: %.2000s",
+                run_result.returncode,
+                run_result.stdout,
+                run_result.stderr,
+            )
+        else:
+            log.warning(
+                "No behavioral test results for original code. "
+                "Skipping baseline establishment.",
+            )
         return None
 
     # Steps 2 & 3 run concurrently: line profiling and benchmarking