From cefd625d353be3f2283a53d7df61728ece0116ba Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Tue, 21 Apr 2026 20:41:43 -0500 Subject: [PATCH] Fix pytest 9 compat, addopts conflicts, XML recovery, and diagnostics - Use getattr for rootdir/rootpath in discovery worker (pytest 9 compat) - Add -o addopts= to all pytest invocations to override project config - Extract _base_pytest_args helper to eliminate duplication across runners - Support [tool.pytest] config section (not just [tool.pytest.ini_options]) - Add --dist, --no-flaky-report, --failed-first to BLACKLIST_ADDOPTS - Add recover=True to XMLParser for malformed JUnit XML tolerance - Log subprocess stdout/stderr on baseline and candidate test failures - Friendly warning when GitHub App not installed instead of raw error - Upgrade repair failure logging from debug to warning --- .../analysis/_discovery_worker.py | 3 ++- .../codeflash_python/codegen/_create_pr.py | 7 ++++++ .../pipeline/_candidate_eval.py | 8 +++++++ .../pipeline/_candidate_gen.py | 2 +- .../testing/_pytest_config.py | 22 +++++++++++++------ .../codeflash_python/testing/_test_runner.py | 22 +++++++++++-------- .../codeflash_python/testing/_xml_parser.py | 4 ++-- .../verification/_baseline.py | 17 ++++++++++---- 8 files changed, 61 insertions(+), 24 deletions(-) diff --git a/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py b/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py index 4ec4d06..74a6af1 100644 --- a/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py +++ b/packages/codeflash-python/src/codeflash_python/analysis/_discovery_worker.py @@ -57,7 +57,7 @@ class PytestCollectionPlugin: global pytest_rootdir, collected_tests collected_tests.extend(session.items) - pytest_rootdir = session.config.rootdir + pytest_rootdir = getattr(session.config, "rootdir", None) or getattr(session.config, "rootpath", None) # Write results immediately since pytest.main() will exit after # this callback, not always with a success code. @@ -87,6 +87,7 @@ if __name__ == "__main__": tests_root, "-p", "no:logging", + "-o", "addopts=", "--collect-only", "-m", "not skip", diff --git a/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py b/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py index 2541416..0809961 100644 --- a/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py +++ b/packages/codeflash-python/src/codeflash_python/codegen/_create_pr.py @@ -149,6 +149,13 @@ def check_create_pr( # noqa: PLR0913 pr_id, pr_url, ) + elif response.status_code == 401 and "not installed" in response.text: # noqa: PLR2004 + log.warning( + "Could not create PR: the Codeflash GitHub App is not installed on %s/%s. " + "Install it at https://github.com/apps/codeflash-ai to enable automatic PRs.", + owner, + repo, + ) else: log.error( "Failed to create PR: %s", diff --git a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py index d8be475..b027e9e 100644 --- a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py +++ b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_eval.py @@ -172,6 +172,14 @@ def run_tests_and_benchmark( # noqa: PLR0913 cid, len(diffs), ) + if not diffs and run_result is not None and run_result.returncode != 0: + log.warning( + "Candidate %s test subprocess crashed (exit code %s).\nSTDOUT: %.2000s\nSTDERR: %.2000s", + cid, + run_result.returncode, + run_result.stdout, + run_result.stderr, + ) eval_ctx.record_failed(cid) # Store diffs for potential code repair. if diffs: diff --git a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py index 6814441..f2766e8 100644 --- a/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py +++ b/packages/codeflash-python/src/codeflash_python/pipeline/_candidate_gen.py @@ -332,7 +332,7 @@ def repair_failed_candidates( try: result = code_repair(ai_client, request) except Exception: # noqa: BLE001 - log.debug( + log.warning( "Repair failed for candidate %s", cid, exc_info=True, diff --git a/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py b/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py index db08f64..7cf318c 100644 --- a/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py +++ b/packages/codeflash-python/src/codeflash_python/testing/_pytest_config.py @@ -22,6 +22,9 @@ BLACKLIST_ADDOPTS: tuple[str, ...] = ( "--cov", "--profile", "--junitxml", + "--dist", + "--no-flaky-report", + "--failed-first", "-n", ) @@ -96,14 +99,14 @@ def modify_addopts( try: if filename == "pyproject.toml": data = tomlkit.parse(content) + pytest_section = data.get("tool", {}).get("pytest", {}) original_addopts = ( - data.get("tool", {}) - .get("pytest", {}) - .get("ini_options", {}) - .get("addopts", "") + pytest_section.get("ini_options", {}).get("addopts", "") + or pytest_section.get("addopts", "") ) if original_addopts == "": return content, False + uses_ini_options = "ini_options" in pytest_section and "addopts" in pytest_section.get("ini_options", {}) if isinstance(original_addopts, list): original_addopts = " ".join(original_addopts) original_addopts = original_addopts.replace("=", " ") @@ -134,9 +137,14 @@ def modify_addopts( if new_addopts_args == addopts_args: return content, False if file_type == ".toml": - data["tool"]["pytest"]["ini_options"]["addopts"] = ( # type: ignore[index] - " ".join(new_addopts_args) - ) + if uses_ini_options: + data["tool"]["pytest"]["ini_options"]["addopts"] = ( # type: ignore[index] + " ".join(new_addopts_args) + ) + else: + data["tool"]["pytest"]["addopts"] = ( # type: ignore[index] + " ".join(new_addopts_args) + ) with config_file.open("w", encoding="utf-8") as f: f.write(tomlkit.dumps(data)) return content, True diff --git a/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py b/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py index 258e10a..d19258d 100644 --- a/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py +++ b/packages/codeflash-python/src/codeflash_python/testing/_test_runner.py @@ -22,6 +22,16 @@ _PER_FILE_TIMEOUT = 60 _MAX_TIMEOUT = 600 +def _base_pytest_args(rootdir: Path | None, cwd: Path) -> list[str]: + """Common pytest args shared across all test runner functions.""" + return [ + "--capture=tee-sys", + "-q", + f"--rootdir={rootdir or cwd}", + "-o", "addopts=", + ] + + def _subprocess_timeout(num_test_files: int) -> int: """Compute subprocess timeout from the number of test files.""" return min(_BASE_TIMEOUT + _PER_FILE_TIMEOUT * num_test_files, _MAX_TIMEOUT) @@ -105,9 +115,7 @@ def run_behavioral_tests( # noqa: PLR0913 *shlex.split(pytest_cmd), ] common_args = [ - "--capture=tee-sys", - "-q", - f"--rootdir={rootdir or cwd}", + *_base_pytest_args(rootdir, cwd), "--codeflash_loops_scope=session", "--codeflash_min_loops=1", "--codeflash_max_loops=1", @@ -226,9 +234,7 @@ def run_benchmarking_tests( # noqa: PLR0913 ) pytest_args = [ - "--capture=tee-sys", - "-q", - f"--rootdir={rootdir or cwd}", + *_base_pytest_args(rootdir, cwd), "--codeflash_loops_scope=session", f"--codeflash_min_loops={min_loops}", f"--codeflash_max_loops={max_loops}", @@ -299,9 +305,7 @@ def run_line_profile_tests( # noqa: PLR0913 ) pytest_args = [ - "--capture=tee-sys", - "-q", - f"--rootdir={rootdir or cwd}", + *_base_pytest_args(rootdir, cwd), "--codeflash_loops_scope=session", "--codeflash_min_loops=1", "--codeflash_max_loops=1", diff --git a/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py b/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py index 17baa19..e24cf2a 100644 --- a/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py +++ b/packages/codeflash-python/src/codeflash_python/testing/_xml_parser.py @@ -48,13 +48,13 @@ matches_re_end = re.compile( def _parse_func(file_path): # type: ignore[no-untyped-def] - """XML parser with huge_tree=True to handle large JUnit XML files.""" + """XML parser with huge_tree and recover to handle large/malformed JUnit XML.""" from lxml.etree import ( # type: ignore[import-untyped] # noqa: PLC0415 XMLParser, parse, ) - xml_parser = XMLParser(huge_tree=True) + xml_parser = XMLParser(huge_tree=True, recover=True) return parse(file_path, xml_parser) diff --git a/packages/codeflash-python/src/codeflash_python/verification/_baseline.py b/packages/codeflash-python/src/codeflash_python/verification/_baseline.py index 8057a7c..55d22d2 100644 --- a/packages/codeflash-python/src/codeflash_python/verification/_baseline.py +++ b/packages/codeflash-python/src/codeflash_python/verification/_baseline.py @@ -236,10 +236,19 @@ def establish_original_code_baseline( # noqa: PLR0913 ) if not behavioral_results: - log.warning( - "No behavioral test results for original code. " - "Skipping baseline establishment.", - ) + if run_result is not None: + log.warning( + "No behavioral test results for original code (exit code %s). " + "Skipping baseline establishment.\nSTDOUT: %.2000s\nSTDERR: %.2000s", + run_result.returncode, + run_result.stdout, + run_result.stderr, + ) + else: + log.warning( + "No behavioral test results for original code. " + "Skipping baseline establishment.", + ) return None # Steps 2 & 3 run concurrently: line profiling and benchmarking