Add unit tests for async test runners and candidate evaluation

29 new tests in test_test_runner.py covering async_execute_test_subprocess,
async_run_behavioral_tests, async_run_benchmarking_tests, _base_pytest_args,
the replay-test path, and the coverage path.

21 new tests in test_candidate_eval.py covering evaluate_candidate,
rank_candidates, build_benchmark_details, log_evaluation_results, and
async_run_tests_and_benchmark.
Kevin Turcios 2026-04-23 00:24:59 -05:00
parent 8d308fe8e8
commit 8defba8a72
2 changed files with 1773 additions and 2 deletions

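The async cases below are marked @pytest.mark.asyncio, so the suites rely on pytest-asyncio. Assuming a conventional tests/ layout (the directory is an assumption; only the two file names appear above), the new suites can be run directly:

    pytest tests/test_test_runner.py tests/test_candidate_eval.py -q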

@@ -3,16 +3,20 @@ from __future__ import annotations
import subprocess
import sys
from pathlib import Path
from unittest.mock import patch
from unittest.mock import AsyncMock, patch
import pytest
from codeflash_python.test_discovery.models import TestType
from codeflash_python.test_discovery.models import TestsInFile, TestType
from codeflash_python.testing._test_runner import (
    _BASE_TIMEOUT,
    _MAX_TIMEOUT,
    _PER_FILE_TIMEOUT,
    _base_pytest_args,
    _subprocess_timeout,
    async_execute_test_subprocess,
    async_run_behavioral_tests,
    async_run_benchmarking_tests,
    execute_test_subprocess,
    run_behavioral_tests,
    run_benchmarking_tests,
@@ -443,3 +447,683 @@ class TestExecuteTestSubprocessIntegration:
        assert result.returncode != 0
        assert "" == result.stdout
        assert "" == result.stderr


def make_replay_test_files(tmp_path: Path) -> TestFiles:
    """Create a TestFiles with a REPLAY_TEST entry containing tests_in_file."""
    instrumented = tmp_path / "test_replay_instrumented.py"
    instrumented.touch()
    tif1 = TestsInFile(
        test_file=instrumented,
        test_class=None,
        test_function="test_replay_one",
        test_type=TestType.REPLAY_TEST,
    )
    tif2 = TestsInFile(
        test_file=instrumented,
        test_class=None,
        test_function="test_replay_two",
        test_type=TestType.REPLAY_TEST,
    )
    tf = TestFile(
        original_file_path=tmp_path / "test_orig.py",
        instrumented_behavior_file_path=instrumented,
        benchmarking_file_path=None,
        test_type=TestType.REPLAY_TEST,
        tests_in_file=(tif1, tif2),
    )
    return TestFiles(test_files=[tf])


class TestBasePytestArgs:
    """_base_pytest_args common argument generation."""

    def test_uses_cwd_when_no_rootdir(self) -> None:
        """Falls back to cwd when rootdir is None."""
        cwd = Path("/project")
        args = _base_pytest_args(rootdir=None, cwd=cwd)
        assert "--rootdir=/project" in args

    def test_uses_rootdir_when_provided(self) -> None:
        """Uses rootdir over cwd when both are given."""
        args = _base_pytest_args(
            rootdir=Path("/custom/root"),
            cwd=Path("/project"),
        )
        assert "--rootdir=/custom/root" in args

    def test_always_includes_capture_and_quiet(self) -> None:
        """Always includes --capture=tee-sys and -q."""
        args = _base_pytest_args(rootdir=None, cwd=Path("/p"))
        assert "--capture=tee-sys" in args
        assert "-q" in args

    def test_resets_addopts(self) -> None:
        """Resets addopts to empty to avoid inheriting user config."""
        args = _base_pytest_args(rootdir=None, cwd=Path("/p"))
        idx = args.index("-o")
        assert "addopts=" == args[idx + 1]


class TestRunBehavioralTestsReplayPath:
    """run_behavioral_tests REPLAY_TEST branch."""

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    def test_replay_test_uses_node_ids(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """REPLAY_TEST files expand to individual test function node ids."""
        mock_exec.return_value = make_completed_process()
        test_files = make_replay_test_files(tmp_path)
        run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "::test_replay_one" in cmd_str
        assert "::test_replay_two" in cmd_str

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    def test_replay_test_deduplicates(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Duplicate node ids from replay tests are deduplicated."""
        mock_exec.return_value = make_completed_process()
        instrumented = tmp_path / "test_replay.py"
        instrumented.touch()
        tif = TestsInFile(
            test_file=instrumented,
            test_class=None,
            test_function="test_dup",
            test_type=TestType.REPLAY_TEST,
        )
        tf1 = TestFile(
            original_file_path=tmp_path / "test_orig.py",
            instrumented_behavior_file_path=instrumented,
            test_type=TestType.REPLAY_TEST,
            tests_in_file=(tif, tif),
        )
        files = TestFiles(test_files=[tf1])
        run_behavioral_tests(
            files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        node_ids = [arg for arg in cmd if "::test_dup" in arg]
        assert 1 == len(node_ids)
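
# A minimal sketch of the replay-path expansion the two tests above exercise,
# assuming the runner builds "file::function" node ids and dedupes them while
# preserving order. The helper name is illustrative, not the runner's actual
# code.
def _replay_node_ids_sketch(test_files: TestFiles) -> list[str]:
    node_ids = [
        f"{tf.instrumented_behavior_file_path.as_posix()}::{tif.test_function}"
        for tf in test_files.test_files
        if tf.test_type == TestType.REPLAY_TEST
        for tif in tf.tests_in_file
    ]
    return list(dict.fromkeys(node_ids))  # order-preserving dedup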


class TestRunBehavioralTestsCoverage:
    """run_behavioral_tests coverage path (enable_coverage=True)."""

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    @patch(
        "codeflash_python.analysis._coverage.prepare_coverage_files",
    )
    @patch(
        "codeflash_python.verification._baseline.jit_disabled_env",
    )
    def test_coverage_uses_coverage_cmd(
        self,
        mock_jit,
        mock_prepare,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """When enable_coverage=True, uses 'coverage run' command."""
        mock_jit.return_value = {"PYTHONDONTWRITEBYTECODE": "1"}
        cov_db = tmp_path / ".coverage"
        cov_cfg = tmp_path / ".coveragerc"
        cov_db.touch()
        cov_cfg.touch()
        mock_prepare.return_value = (cov_db, cov_cfg)
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        result_path, _, cov_db_result, cov_cfg_result = run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            enable_coverage=True,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "coverage" in cmd_str
        assert "run" in cmd_str
        assert f"--rcfile={cov_cfg.as_posix()}" in cmd_str
        assert cov_db == cov_db_result
        assert cov_cfg == cov_cfg_result

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    @patch(
        "codeflash_python.analysis._coverage.prepare_coverage_files",
    )
    @patch(
        "codeflash_python.verification._baseline.jit_disabled_env",
    )
    def test_coverage_does_not_block_cov_plugin(
        self,
        mock_jit,
        mock_prepare,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Coverage path does not blocklist the cov pytest plugin."""
        mock_jit.return_value = {}
        mock_prepare.return_value = (
            tmp_path / ".coverage",
            tmp_path / ".coveragerc",
        )
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            enable_coverage=True,
        )
        cmd = mock_exec.call_args[0][0]
        # Join tokens before checking, since "-p" and "no:cov" are separate
        # argv entries; a membership test on the raw list would always pass.
        cmd_str = " ".join(cmd)
        assert "-p no:cov" not in cmd_str

    @patch("codeflash_python.testing._test_runner.execute_test_subprocess")
    @patch(
        "codeflash_python.analysis._coverage.prepare_coverage_files",
    )
    @patch(
        "codeflash_python.verification._baseline.jit_disabled_env",
    )
    def test_coverage_merges_jit_env(
        self,
        mock_jit,
        mock_prepare,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Coverage path merges JIT-disabled env vars into test env."""
        mock_jit.return_value = {"PYTHONDONTWRITEBYTECODE": "1"}
        mock_prepare.return_value = (
            tmp_path / ".coverage",
            tmp_path / ".coveragerc",
        )
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            enable_coverage=True,
        )
        env = mock_exec.call_args.kwargs["env"]
        assert "1" == env["PYTHONDONTWRITEBYTECODE"]


class TestAsyncExecuteTestSubprocess:
    """async_execute_test_subprocess async subprocess invocation."""

    @pytest.mark.asyncio
    async def test_basic_execution(self) -> None:
        """Runs a simple command and captures stdout."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "print('async hello')"],
            cwd=Path.cwd(),
            env=None,
        )
        assert 0 == result.returncode
        assert "async hello" in result.stdout

    @pytest.mark.asyncio
    async def test_captures_stderr(self) -> None:
        """Captures stderr from the subprocess."""
        result = await async_execute_test_subprocess(
            [
                sys.executable,
                "-c",
                "import sys; sys.stderr.write('async err\\n')",
            ],
            cwd=Path.cwd(),
            env=None,
        )
        assert "async err" in result.stderr

    @pytest.mark.asyncio
    async def test_nonzero_exit(self) -> None:
        """Non-zero exit code is captured without raising."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "raise SystemExit(2)"],
            cwd=Path.cwd(),
            env=None,
        )
        assert 2 == result.returncode

    @pytest.mark.asyncio
    async def test_timeout_returns_minus_one(self) -> None:
        """Returns returncode=-1 and empty output on timeout."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "import time; time.sleep(30)"],
            cwd=Path.cwd(),
            env=None,
            timeout=1,
        )
        assert -1 == result.returncode
        assert isinstance(result.stdout, str)
        assert isinstance(result.stderr, str)

    @pytest.mark.asyncio
    async def test_default_timeout_is_600(self) -> None:
        """Default timeout parameter is 600."""
        with patch(
            "codeflash_python.testing._test_runner.asyncio.create_subprocess_exec",
            new_callable=AsyncMock,
        ) as mock_create:
            mock_proc = AsyncMock()
            mock_proc.communicate.return_value = (b"out", b"err")
            mock_proc.returncode = 0
            mock_create.return_value = mock_proc
            with patch(
                "codeflash_python.testing._test_runner.asyncio.wait_for",
                new_callable=AsyncMock,
            ) as mock_wait:
                mock_wait.return_value = (b"out", b"err")
                await async_execute_test_subprocess(
                    ["pytest"],
                    cwd=Path("/project"),
                    env=None,
                )
                assert 600 == mock_wait.call_args[1]["timeout"]

    @pytest.mark.asyncio
    async def test_empty_stdout_returns_empty_string(self) -> None:
        """Empty stdout bytes decode to empty string, not None."""
        result = await async_execute_test_subprocess(
            [sys.executable, "-c", "pass"],
            cwd=Path.cwd(),
            env=None,
        )
        assert "" == result.stdout


class TestAsyncRunBehavioralTests:
    """async_run_behavioral_tests async behavioral test runner."""

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_builds_correct_command(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Command includes session scope and min/max loops of 1."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "--codeflash_loops_scope=session" in cmd_str
        assert "--codeflash_min_loops=1" in cmd_str
        assert "--codeflash_max_loops=1" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_returns_four_tuple_with_none_coverage(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Returns (Path, CompletedProcess, None, None); no coverage support."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        result_path, proc, cov_db, cov_cfg = await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        assert isinstance(result_path, Path)
        assert "pytest_results.xml" in str(result_path)
        assert cov_db is None
        assert cov_cfg is None

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_custom_result_file_name(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Respects custom result_file_name parameter."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        result_path, _, _, _ = await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            result_file_name="custom_results.xml",
        )
        assert "custom_results.xml" in str(result_path)

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_blocklists_plugins(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Blocklists benchmark, codspeed, xdist, sugar plugins."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "-p no:benchmark" in cmd_str
        assert "-p no:xdist" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_replay_test_uses_node_ids(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """REPLAY_TEST files expand to individual test function node ids."""
        mock_exec.return_value = make_completed_process()
        test_files = make_replay_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "::test_replay_one" in cmd_str
        assert "::test_replay_two" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_sets_pytest_plugin_env(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Sets PYTEST_PLUGINS env var for codeflash plugin."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        env = mock_exec.call_args.kwargs["env"]
        assert "PYTEST_PLUGINS" in env

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_timeout_appended_when_provided(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """When timeout is set, --timeout=N appears in the command."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(tmp_path)
        await async_run_behavioral_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            timeout=42,
        )
        cmd = mock_exec.call_args[0][0]
        assert "--timeout=42" in cmd


class TestAsyncRunBenchmarkingTests:
    """async_run_benchmarking_tests async benchmarking test runner."""

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_includes_loop_params(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Command includes min_loops, max_loops, target_duration."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            min_loops=10,
            max_loops=5000,
            target_duration_seconds=5.0,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "--codeflash_min_loops=10" in cmd_str
        assert "--codeflash_max_loops=5000" in cmd_str
        assert "--codeflash_seconds=5.0" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_stability_check_flag(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Command includes --codeflash_stability_check=true."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "--codeflash_stability_check=true" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_returns_result_path(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Returns a Path for XML results as first element."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        result_path, _ = await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        assert isinstance(result_path, Path)

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_custom_result_file_name(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Respects custom result_file_name parameter."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        result_path, _ = await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            result_file_name="bench_results.xml",
        )
        assert "bench_results.xml" in str(result_path)

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_blocklists_profiling_plugins(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Blocklists codspeed, cov, benchmark, profiling, xdist, sugar."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        cmd = mock_exec.call_args[0][0]
        cmd_str = " ".join(cmd)
        assert "-p no:codspeed" in cmd_str
        assert "-p no:cov" in cmd_str
        assert "-p no:benchmark" in cmd_str
        assert "-p no:profiling" in cmd_str
        assert "-p no:xdist" in cmd_str
        assert "-p no:sugar" in cmd_str

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_sets_pytest_plugin_env(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """Sets PYTEST_PLUGINS env var for codeflash plugin."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
        )
        env = mock_exec.call_args.kwargs["env"]
        assert "PYTEST_PLUGINS" in env

    @pytest.mark.asyncio
    @patch(
        "codeflash_python.testing._test_runner.async_execute_test_subprocess",
        new_callable=AsyncMock,
    )
    async def test_timeout_appended_when_provided(
        self,
        mock_exec,
        tmp_path: Path,
    ) -> None:
        """When timeout is set, --timeout=N appears in the command."""
        mock_exec.return_value = make_completed_process()
        test_files = make_test_files(
            tmp_path,
            with_benchmarking=True,
        )
        await async_run_benchmarking_tests(
            test_files,
            test_env={"PATH": "/bin"},
            cwd=tmp_path,
            timeout=99,
        )
        cmd = mock_exec.call_args[0][0]
        assert "--timeout=99" in cmd