Add unit tests for async test runners and candidate evaluation

29 new tests in test_test_runner.py covering async_execute_test_subprocess,
async_run_behavioral_tests, async_run_benchmarking_tests, _base_pytest_args,
the replay-test path, and the coverage path.

21 new tests in test_candidate_eval.py covering evaluate_candidate,
rank_candidates, build_benchmark_details, log_evaluation_results, and
async_run_tests_and_benchmark.
Kevin Turcios 2026-04-23 00:24:59 -05:00
parent 8d308fe8e8
commit 8defba8a72
2 changed files with 1773 additions and 2 deletions

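The async cases below are marked @pytest.mark.asyncio, so the suites rely on pytest-asyncio. Assuming a conventional tests/ layout (the directory is an assumption; only the two file names appear above), the new suites can be run directly:

    pytest tests/test_test_runner.py tests/test_candidate_eval.py -q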

@@ -3,16 +3,20 @@ from __future__ import annotations
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
from unittest.mock import AsyncMock, patch
import pytest
from codeflash_python.test_discovery.models import TestType
from codeflash_python.test_discovery.models import TestsInFile, TestType
from codeflash_python.testing._test_runner import (
    _BASE_TIMEOUT,
    _MAX_TIMEOUT,
    _PER_FILE_TIMEOUT,
    _base_pytest_args,
    _subprocess_timeout,
    async_execute_test_subprocess,
    async_run_behavioral_tests,
    async_run_benchmarking_tests,
    execute_test_subprocess,
    run_behavioral_tests,
    run_benchmarking_tests,
@@ -443,3 +447,683 @@ class TestExecuteTestSubprocessIntegration:
        assert result.returncode != 0
        assert "" == result.stdout
        assert "" == result.stderr


def make_replay_test_files(tmp_path: Path) -> TestFiles:
    """Create a TestFiles with a REPLAY_TEST entry containing tests_in_file."""
    instrumented = tmp_path / "test_replay_instrumented.py"
    instrumented.touch()
    tif1 = TestsInFile(
        test_file=instrumented,
        test_class=None,
        test_function="test_replay_one",
        test_type=TestType.REPLAY_TEST,
    )
    tif2 = TestsInFile(
        test_file=instrumented,
        test_class=None,
        test_function="test_replay_two",
        test_type=TestType.REPLAY_TEST,
    )
    tf = TestFile(
        original_file_path=tmp_path / "test_orig.py",
        instrumented_behavior_file_path=instrumented,
        benchmarking_file_path=None,
        test_type=TestType.REPLAY_TEST,
        tests_in_file=(tif1, tif2),
    )
    return TestFiles(test_files=[tf])


class TestBasePytestArgs:
    """_base_pytest_args common argument generation."""

    def test_uses_cwd_when_no_rootdir(self) -> None:
        """Falls back to cwd when rootdir is None."""
        cwd = Path("/project")
        args = _base_pytest_args(rootdir=None, cwd=cwd)
        assert "--rootdir=/project" in args

    def test_uses_rootdir_when_provided(self) -> None:
        """Uses rootdir over cwd when both are given."""
        args = _base_pytest_args(
            rootdir=Path("/custom/root"),
            cwd=Path("/project"),
        )
        assert "--rootdir=/custom/root" in args

    def test_always_includes_capture_and_quiet(self) -> None:
        """Always includes --capture=tee-sys and -q."""
        args = _base_pytest_args(rootdir=None, cwd=Path("/p"))
        assert "--capture=tee-sys" in args
        assert "-q" in args

    def test_resets_addopts(self) -> None:
        """Resets addopts to empty to avoid inheriting user config."""
        args = _base_pytest_args(rootdir=None, cwd=Path("/p"))
        idx = args.index("-o")
        assert "addopts=" == args[idx + 1]


class TestRunBehavioralTestsReplayPath:
    """run_behavioral_tests REPLAY_TEST branch."""

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    def test_replay_test_uses_node_ids(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """REPLAY_TEST files expand to individual test function node ids."""
        mock_exec.return_value = make_completed_process()
        test_files = make_replay_test_files(tmp_path)
        run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "::test_replay_one" in cmd_str
        assert "::test_replay_two" in cmd_str

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    def test_replay_test_deduplicates(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Duplicate node ids from replay tests are deduplicated."""
        mock_exec.return_value = make_completed_process()
        instrumented = tmp_path / "test_replay.py"
        instrumented.touch()
        tif = TestsInFile(
            test_file=instrumented,
            test_class=None,
            test_function="test_dup",
            test_type=TestType.REPLAY_TEST,
        )
        tf1 = TestFile(
            original_file_path=tmp_path / "test_orig.py",
            instrumented_behavior_file_path=instrumented,
            test_type=TestType.REPLAY_TEST,
            tests_in_file=(tif, tif),
        )
        files = TestFiles(test_files=[tf1])
        run_behavioral_tests(
            files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        node_ids = [arg for arg in cmd if "::test_dup" in arg]
        assert 1 == len(node_ids)
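
# A minimal sketch of the replay-path expansion the two tests above exercise,
# assuming the runner builds "file::function" node ids and dedupes them while
# preserving order. The helper name is illustrative, not the runner's actual
# code.
def _replay_node_ids_sketch(test_files: TestFiles) -> list[str]:
    node_ids = [
        f"{tf.instrumented_behavior_file_path.as_posix()}::{tif.test_function}"
        for tf in test_files.test_files
        if tf.test_type == TestType.REPLAY_TEST
        for tif in tf.tests_in_file
    ]
    return list(dict.fromkeys(node_ids))  # order-preserving dedup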


class TestRunBehavioralTestsCoverage:
    """run_behavioral_tests coverage path (enable_coverage=True)."""

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    @patch(
        "codeflash_python.analysis._coverage.prepare_coverage_files",
    )
    @patch(
        "codeflash_python.verification._baseline.jit_disabled_env",
    )
    def test_coverage_uses_coverage_cmd(
        self,
        mock_jit,
        mock_prepare,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """When enable_coverage=True, uses 'coverage run' command."""
        mock_jit.return_value = {"PYTHONDONTWRITEBYTECODE": "1"}
        cov_db = tmp_path / ".coverage"
        cov_cfg = tmp_path / ".coveragerc"
        cov_db.touch()
        cov_cfg.touch()
        mock_prepare.return_value = (cov_db, cov_cfg)
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        result_path, _, cov_db_result, cov_cfg_result = run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            enable_coverage=True,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "coverage" in cmd_str
        assert "run" in cmd_str
        assert f"--rcfile={cov_cfg.as_posix()}" in cmd_str
        assert cov_db == cov_db_result
        assert cov_cfg == cov_cfg_result

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    @patch(
        "codeflash_python.analysis._coverage.prepare_coverage_files",
    )
    @patch(
        "codeflash_python.verification._baseline.jit_disabled_env",
    )
    def test_coverage_does_not_block_cov_plugin(
        self,
        mock_jit,
        mock_prepare,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Coverage path does not blocklist the cov pytest plugin."""
        mock_jit.return_value = {}
        mock_prepare.return_value = (
            tmp_path / ".coverage",
            tmp_path / ".coveragerc",
        )
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            enable_coverage=True,
        )
        cmd = mock_exec.call_args[0][0]
        # Join tokens before checking, since "-p" and "no:cov" are separate
        # argv entries; a membership test on the raw list would always pass.
        cmd_str = " ".join(cmd)
        assert "-p no:cov" not in cmd_str

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    @patch(
        "codeflash_python.analysis._coverage.prepare_coverage_files",
    )
    @patch(
        "codeflash_python.verification._baseline.jit_disabled_env",
    )
    def test_coverage_merges_jit_env(
        self,
        mock_jit,
        mock_prepare,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Coverage path merges JIT-disabled env vars into test env."""
        mock_jit.return_value = {"PYTHONDONTWRITEBYTECODE": "1"}
        mock_prepare.return_value = (
            tmp_path / ".coverage",
            tmp_path / ".coveragerc",
        )
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            enable_coverage=True,
        )
        env = mock_exec.call_args.kwargs["env"]
        assert "1" == env["PYTHONDONTWRITEBYTECODE"]


class TestAsyncExecuteTestSubprocess:
    """async_execute_test_subprocess async subprocess invocation."""

    @pytest.mark.asyncio
    async def test_basic_execution(self) -> None:
        """Runs a simple command and captures stdout."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "print('async hello')"],
            cwd=Path.cwd(),
            env=None,
        )
        assert 0 == result.returncode
        assert "async hello" in result.stdout

    @pytest.mark.asyncio
    async def test_captures_stderr(self) -> None:
        """Captures stderr from the subprocess."""
        result = await async_execute_test_subprocess(
            [
                sys.executable,
                "-c",
                "import sys; sys.stderr.write('async err\\n')",
            ],
            cwd=Path.cwd(),
            env=None,
        )
        assert "async err" in result.stderr

    @pytest.mark.asyncio
    async def test_nonzero_exit(self) -> None:
        """Non-zero exit code is captured without raising."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "raise SystemExit(2)"],
            cwd=Path.cwd(),
            env=None,
        )
        assert 2 == result.returncode

    @pytest.mark.asyncio
    async def test_timeout_returns_minus_one(self) -> None:
        """Returns returncode=-1 and empty output on timeout."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "import time; time.sleep(30)"],
            cwd=Path.cwd(),
            env=None,
            timeout=1,
        )
        assert -1 == result.returncode
        assert isinstance(result.stdout, str)
        assert isinstance(result.stderr, str)

    @pytest.mark.asyncio
    async def test_default_timeout_is_600(self) -> None:
        """Default timeout parameter is 600."""
        with patch(
            "codeflash_python.testing._test_runner.asyncio.create_subprocess_exec",
            new_callable=AsyncMock,
        ) as mock_create:
            mock_proc = AsyncMock()
            mock_proc.communicate.return_value = (b"out", b"err")
            mock_proc.returncode = 0
            mock_create.return_value = mock_proc
            with patch(
                "codeflash_python.testing._test_runner.asyncio.wait_for",
                new_callable=AsyncMock,
            ) as mock_wait:
                mock_wait.return_value = (b"out", b"err")
                await async_execute_test_subprocess(
                    ["pytest"],
                    cwd=Path("/project"),
                    env=None,
                )
                assert 600 == mock_wait.call_args[1]["timeout"]

    @pytest.mark.asyncio
    async def test_empty_stdout_returns_empty_string(self) -> None:
        """Empty stdout bytes decode to empty string, not None."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "pass"],
            cwd=Path.cwd(),
            env=None,
        )
        assert "" == result.stdout


class TestAsyncRunBehavioralTests:
    """async_run_behavioral_tests async behavioral test runner."""

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_builds_correct_command(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Command includes session scope and min/max loops of 1."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "--codeflash_loops_scope=session" in cmd_str
        assert "--codeflash_min_loops=1" in cmd_str
        assert "--codeflash_max_loops=1" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_returns_four_tuple_with_none_coverage(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Returns (Path, CompletedProcess, None, None); no coverage support."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        result_path, proc, cov_db, cov_cfg = await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        assert isinstance(result_path, Path)
        assert "pytest_results.xml" in str(result_path)
        assert cov_db is None
        assert cov_cfg is None

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_custom_result_file_name(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Respects custom result_file_name parameter."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        result_path, _, _, _ = await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            result_file_name="custom_results.xml",
        )
        assert "custom_results.xml" in str(result_path)

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_blocklists_plugins(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Blocklists benchmark, codspeed, xdist, sugar plugins."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "-p no:benchmark" in cmd_str
        assert "-p no:xdist" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_replay_test_uses_node_ids(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """REPLAY_TEST files expand to individual test function node ids."""
        mock_exec.return_value = make_completed_process()
        test_files = make_replay_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "::test_replay_one" in cmd_str
        assert "::test_replay_two" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_sets_pytest_plugin_env(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Sets PYTEST_PLUGINS env var for codeflash plugin."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        env = mock_exec.call_args.kwargs["env"]
        assert "PYTEST_PLUGINS" in env

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_timeout_appended_when_provided(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """When timeout is set, --timeout=N appears in the command."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            timeout=42,
        )
        cmd = mock_exec.call_args[0][0]
        assert "--timeout=42" in cmd


class TestAsyncRunBenchmarkingTests:
    """async_run_benchmarking_tests async benchmarking test runner."""

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_includes_loop_params(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Command includes min_loops, max_loops, target_duration."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            min_loops=10,
            max_loops=5000,
            target_duration_seconds=5.0,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "--codeflash_min_loops=10" in cmd_str
        assert "--codeflash_max_loops=5000" in cmd_str
        assert "--codeflash_seconds=5.0" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_stability_check_flag(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Command includes --codeflash_stability_check=true."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "--codeflash_stability_check=true" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_returns_result_path(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Returns a Path for XML results as first element."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        result_path, _ = await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        assert isinstance(result_path, Path)

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_custom_result_file_name(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Respects custom result_file_name parameter."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        result_path, _ = await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            result_file_name="bench_results.xml",
        )
        assert "bench_results.xml" in str(result_path)

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_blocklists_profiling_plugins(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Blocklists codspeed, cov, benchmark, profiling, xdist, sugar."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "-p no:codspeed" in cmd_str
        assert "-p no:cov" in cmd_str
        assert "-p no:benchmark" in cmd_str
        assert "-p no:profiling" in cmd_str
        assert "-p no:xdist" in cmd_str
        assert "-p no:sugar" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_sets_pytest_plugin_env(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Sets PYTEST_PLUGINS env var for codeflash plugin."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        env = mock_exec.call_args.kwargs["env"]
        assert "PYTEST_PLUGINS" in env

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_timeout_appended_when_provided(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """When timeout is set, --timeout=N appears in the command."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            timeout=99,
        )
        cmd = mock_exec.call_args[0][0]
        assert "--timeout=99" in cmd