codeflash-agent/packages/codeflash-python/tests/test_model_test_results.py
Kevin Turcios eb6a0be717 feat: add dual-clock instrumentation (wall + CPU time) and remove dead binary parser
Measure both wall-clock time (perf_counter_ns) and per-thread CPU time
(thread_time_ns) in instrumented test code. cpu_runtime is now a required
int field on FunctionTestInvocation, stored in the SQLite test_results
table as the 10th column.

Also fix the sleeptime.py bug (10e9 → 1e9 divisor) and remove the
binary pickle parser (parse_test_return_values_bin), since no writer
exists in the current codebase; SQLite is the sole data-capture path.
2026-04-24 02:21:22 -05:00
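
A minimal sketch of the dual-clock capture described above; the wrapper name
and shape here are illustrative, not the actual instrumentation template:

    import time

    def timed_call(fn, *args, **kwargs):
        # Wall clock: monotonic, includes sleeps and I/O waits.
        wall_start = time.perf_counter_ns()
        # CPU clock: this thread's CPU time only; sleeping does not accrue.
        cpu_start = time.thread_time_ns()
        result = fn(*args, **kwargs)
        cpu_ns = time.thread_time_ns() - cpu_start
        wall_ns = time.perf_counter_ns() - wall_start
        return result, wall_ns, cpu_ns

The sleeptime.py fix is a unit correction: both counters report nanoseconds,
and one second is 1e9 ns, so converting to seconds divides by 1e9; the old
10e9 divisor understated durations tenfold.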


from __future__ import annotations

from pathlib import Path

import pytest

from codeflash_python._model import VerificationType
from codeflash_python.benchmarking.models import BenchmarkKey
from codeflash_python.test_discovery.models import TestType
from codeflash_python.testing.models import (
    FunctionTestInvocation,
    InvocationId,
    TestConfig,
    TestFile,
    TestFiles,
    TestResults,
)


def make_invocation_id(
    *,
    module: str = "tests.test_foo",
    cls: str | None = "TestFoo",
    func: str | None = "test_bar",
    target: str = "bar",
    iteration: str | None = "0",
) -> InvocationId:
    """Create an InvocationId with sensible defaults."""
    return InvocationId(
        test_module_path=module,
        test_class_name=cls,
        test_function_name=func,
        function_getting_tested=target,
        iteration_id=iteration,
    )


def make_invocation(
    *,
    loop_index: int = 0,
    inv_id: InvocationId | None = None,
    did_pass: bool = True,
    runtime: int | None = 100,
) -> FunctionTestInvocation:
    """Create a FunctionTestInvocation with sensible defaults."""
    return FunctionTestInvocation(
        loop_index=loop_index,
        id=inv_id or make_invocation_id(),
        file_name=Path("tests/test_foo.py"),
        did_pass=did_pass,
        runtime=runtime,
        test_framework="pytest",
        test_type=TestType.EXISTING_UNIT_TEST,
        return_value=None,
        # cpu_runtime is required since the dual-clock change; these tests
        # don't exercise it, so a zero placeholder suffices.
        cpu_runtime=0,
        timed_out=False,
    )


class TestInvocationId:
    """InvocationId identity and parsing."""

    def test_id_with_class(self) -> None:
        """id() includes class prefix when test_class_name is set."""
        inv = make_invocation_id(cls="TestFoo", func="test_bar")
        assert "tests.test_foo:TestFoo.test_bar:bar:0" == inv.id()

    def test_id_without_class(self) -> None:
        """id() has no class prefix when test_class_name is None."""
        inv = make_invocation_id(cls=None, func="test_bar")
        assert "tests.test_foo:test_bar:bar:0" == inv.id()

    def test_fn_qualified_name_with_class(self) -> None:
        """Returns 'Class.function' when class is present."""
        inv = make_invocation_id(cls="TestFoo", func="test_bar")
        assert "TestFoo.test_bar" == inv.test_fn_qualified_name()

    def test_fn_qualified_name_without_class(self) -> None:
        """Returns just 'function' when class is None."""
        inv = make_invocation_id(cls=None, func="test_bar")
        assert "test_bar" == inv.test_fn_qualified_name()

    def test_from_str_id_with_class(self) -> None:
        """Parses 'module:Class.test:func:iter' correctly."""
        result = InvocationId.from_str_id(
            "tests.test_foo:TestFoo.test_bar:bar:0",
        )
        assert "tests.test_foo" == result.test_module_path
        assert "TestFoo" == result.test_class_name
        assert "test_bar" == result.test_function_name
        assert "bar" == result.function_getting_tested
        assert "0" == result.iteration_id

    def test_from_str_id_without_class(self) -> None:
        """Parses 'module:test:func:iter' when no class present."""
        result = InvocationId.from_str_id(
            "tests.test_foo:test_bar:bar:0",
        )
        assert result.test_class_name is None
        assert "test_bar" == result.test_function_name

    def test_from_str_id_with_iteration_override(self) -> None:
        """iteration_id parameter overrides the one in the string."""
        result = InvocationId.from_str_id(
            "tests.test_foo:test_bar:bar:0",
            iteration_id="5",
        )
        assert "5" == result.iteration_id

    def test_from_str_id_invalid(self) -> None:
        """Raises ValueError for malformed input."""
        with pytest.raises(ValueError, match="Expected 4"):
            InvocationId.from_str_id("bad:input")

    def test_frozen(self) -> None:
        """Cannot set attributes on frozen instance."""
        inv = make_invocation_id()
        with pytest.raises(AttributeError):
            inv.test_module_path = "other"  # type: ignore[misc]


class TestFunctionTestInvocation:
    """FunctionTestInvocation data and properties."""

    def test_unique_invocation_loop_id(self) -> None:
        """Combines loop_index and id string."""
        inv = make_invocation(loop_index=3)
        expected = f"3:{inv.id.id()}"
        assert expected == inv.unique_invocation_loop_id

    def test_default_verification_type(self) -> None:
        """Defaults to FUNCTION_CALL when not specified."""
        inv = make_invocation()
        assert VerificationType.FUNCTION_CALL == inv.verification_type

    def test_explicit_verification_type(self) -> None:
        """Accepts explicit verification type."""
        inv = FunctionTestInvocation(
            loop_index=0,
            id=make_invocation_id(),
            file_name=Path("tests/test_foo.py"),
            did_pass=True,
            runtime=100,
            test_framework="pytest",
            test_type=TestType.EXISTING_UNIT_TEST,
            return_value=None,
            cpu_runtime=0,
            timed_out=False,
            verification_type=VerificationType.INIT_STATE_FTO,
        )
        assert VerificationType.INIT_STATE_FTO == inv.verification_type

    def test_frozen(self) -> None:
        """Cannot modify attributes on frozen instance."""
        inv = make_invocation()
        with pytest.raises(AttributeError):
            inv.did_pass = False  # type: ignore[misc]


class TestTestResults:
    """TestResults collection behavior."""

    def test_add_and_len(self) -> None:
        """Adding an invocation increases length."""
        results = TestResults()
        results.add(make_invocation())
        assert 1 == len(results)

    def test_add_dedup(self) -> None:
        """Adding same uid twice only stores once."""
        inv = make_invocation()
        results = TestResults()
        results.add(inv)
        results.add(inv)
        assert 1 == len(results)

    def test_merge(self) -> None:
        """Merges two TestResults together."""
        r1 = TestResults()
        r1.add(make_invocation(loop_index=0))
        r2 = TestResults()
        r2.add(make_invocation(loop_index=1))
        r1.merge(r2)
        assert 2 == len(r1)

    def test_merge_duplicate_raises(self) -> None:
        """Duplicate uid in merge raises ValueError."""
        inv = make_invocation()
        r1 = TestResults()
        r1.add(inv)
        r2 = TestResults()
        r2.add(inv)
        with pytest.raises(ValueError, match="Duplicate"):
            r1.merge(r2)

    def test_get_by_uid(self) -> None:
        """Lookup by unique_invocation_loop_id returns the invocation."""
        inv = make_invocation()
        results = TestResults()
        results.add(inv)
        found = results.get_by_unique_invocation_loop_id(
            inv.unique_invocation_loop_id,
        )
        assert inv == found

    def test_get_by_uid_missing(self) -> None:
        """Returns None for unknown uid."""
        results = TestResults()
        assert results.get_by_unique_invocation_loop_id("x") is None

    def test_number_of_loops(self) -> None:
        """Returns max loop_index across all results."""
        results = TestResults()
        results.add(make_invocation(loop_index=0))
        results.add(make_invocation(loop_index=3))
        assert 3 == results.number_of_loops()

    def test_number_of_loops_empty(self) -> None:
        """Returns 0 for empty results."""
        assert 0 == TestResults().number_of_loops()
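
    # Assumed aggregation behind total_passed_runtime (inferred from the two
    # tests below; the model implementation is not shown in this file): group
    # passing invocations by invocation id, take the minimum runtime across
    # loop iterations within each group, then sum those minima, roughly
    #   sum(min(inv.runtime for inv in group) for group in passed_by_id)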
    def test_total_passed_runtime(self) -> None:
        """Sum of minimum runtimes across passing test cases."""
        inv_id = make_invocation_id()
        results = TestResults()
        results.add(
            make_invocation(loop_index=0, inv_id=inv_id, runtime=200),
        )
        results.add(
            make_invocation(loop_index=1, inv_id=inv_id, runtime=100),
        )
        assert 100 == results.total_passed_runtime()

    def test_total_passed_runtime_excludes_failed(self) -> None:
        """Failed invocations are excluded from runtime sum."""
        results = TestResults()
        results.add(make_invocation(loop_index=0, runtime=200))
        results.add(
            make_invocation(
                loop_index=1,
                inv_id=make_invocation_id(func="test_fail"),
                did_pass=False,
                runtime=50,
            ),
        )
        assert 200 == results.total_passed_runtime()

    def test_iter_and_bool(self) -> None:
        """Iteration yields invocations; empty is falsy, non-empty truthy."""
        results = TestResults()
        assert not results
        inv = make_invocation()
        results.add(inv)
        assert results
        assert [inv] == list(results)

    def test_contains(self) -> None:
        """Invocation in results returns True."""
        inv = make_invocation()
        results = TestResults()
        results.add(inv)
        assert inv in results

    def test_getitem(self) -> None:
        """Index access returns the correct invocation."""
        inv = make_invocation()
        results = TestResults()
        results.add(inv)
        assert inv == results[0]


class TestTestFile:
    """TestFile and TestFiles collection behavior."""

    def test_get_test_type_by_instrumented_path(
        self,
        tmp_path: Path,
    ) -> None:
        """Finds matching test type by instrumented file path."""
        instrumented = tmp_path / "instrumented_test.py"
        instrumented.touch()
        tf = TestFile(
            original_file_path=tmp_path / "test_orig.py",
            instrumented_behavior_file_path=instrumented,
            test_type=TestType.GENERATED_REGRESSION,
        )
        files = TestFiles(test_files=[tf])
        result = files.get_test_type_by_instrumented_file_path(
            instrumented,
        )
        assert TestType.GENERATED_REGRESSION == result

    def test_get_test_type_by_original_path(
        self,
        tmp_path: Path,
    ) -> None:
        """Finds test type by original file path."""
        original = tmp_path / "test_orig.py"
        original.touch()
        tf = TestFile(original_file_path=original)
        files = TestFiles(test_files=[tf])
        result = files.get_test_type_by_original_file_path(original)
        assert TestType.EXISTING_UNIT_TEST == result

    def test_get_test_type_missing(self) -> None:
        """Returns None for unknown path."""
        files = TestFiles()
        result = files.get_test_type_by_instrumented_file_path(
            Path("/nonexistent.py"),
        )
        assert result is None


class TestTestConfig:
    """TestConfig defaults and construction."""

    def test_config_defaults(self) -> None:
        """test_framework defaults to 'pytest'."""
        config = TestConfig(tests_project_rootdir=Path("/project"))
        assert "pytest" == config.test_framework
        assert "pytest" == config.pytest_cmd

    def test_frozen(self) -> None:
        """Cannot modify attributes on frozen instance."""
        config = TestConfig(tests_project_rootdir=Path("/project"))
        with pytest.raises(AttributeError):
            config.test_framework = "unittest"  # type: ignore[misc]


def _make_replay_invocation(
    *,
    module: str,
    func: str = "test_replay",
    loop_index: int = 0,
    runtime: int = 100,
) -> FunctionTestInvocation:
    """Create a REPLAY_TEST invocation with a given module path."""
    return FunctionTestInvocation(
        loop_index=loop_index,
        id=InvocationId(
            test_module_path=module,
            test_class_name=None,
            test_function_name=func,
            function_getting_tested="target",
            iteration_id="0",
        ),
        file_name=Path("tests/test_replay.py"),
        did_pass=True,
        runtime=runtime,
        test_framework="pytest",
        test_type=TestType.REPLAY_TEST,
        return_value=None,
        cpu_runtime=0,
        timed_out=False,
    )


class TestGroupByBenchmarks:
    """TestResults.group_by_benchmarks grouping behavior."""

    def test_groups_replay_results_by_benchmark_key(
        self,
        tmp_path: Path,
    ) -> None:
        """Replay results are grouped under matching benchmark keys."""
        project_root = tmp_path
        replay_dir = tmp_path / "replay"
        replay_dir.mkdir()
        bk = BenchmarkKey(
            module_path="benchmarks.test_sort",
            function_name="sort_fn",
        )
        # module_name_from_file_path converts the replay dir path
        # into a dotted prefix: replay/test_benchmarks_test_sort__replay_test_
        # => replay.test_benchmarks_test_sort__replay_test_
        expected_prefix = "replay.test_benchmarks_test_sort__replay_test_"
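        # A sketch of the assumed conversion (module_name_from_file_path is
        # not defined in this file): relativize against project_root, drop
        # the ".py" suffix, and dot-join the path parts, roughly
        #   rel = file_path.relative_to(project_root).with_suffix("")
        #   module = ".".join(rel.parts)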
        results = TestResults()
        matching = _make_replay_invocation(
            module=expected_prefix + "0",
            runtime=200,
        )
        non_matching = _make_replay_invocation(
            module="other.module",
            func="test_other",
            runtime=50,
        )
        results.add(matching)
        results.add(non_matching)
        grouped = results.group_by_benchmarks(
            [bk],
            replay_dir,
            project_root,
        )
        assert bk in grouped
        assert 1 == len(grouped[bk])
        assert matching in grouped[bk]

    def test_non_replay_results_are_excluded(
        self,
        tmp_path: Path,
    ) -> None:
        """Only REPLAY_TEST results are included in grouping."""
        project_root = tmp_path
        replay_dir = tmp_path / "replay"
        replay_dir.mkdir()
        bk = BenchmarkKey(
            module_path="benchmarks.test_sort",
            function_name="sort_fn",
        )
        prefix = "replay.test_benchmarks_test_sort__replay_test_"
        results = TestResults()
        # An existing unit test whose module path happens to match.
        unit_inv = FunctionTestInvocation(
            loop_index=0,
            id=InvocationId(
                test_module_path=prefix + "0",
                test_class_name=None,
                test_function_name="test_unit",
                function_getting_tested="target",
                iteration_id="0",
            ),
            file_name=Path("tests/test_unit.py"),
            did_pass=True,
            runtime=100,
            test_framework="pytest",
            test_type=TestType.EXISTING_UNIT_TEST,
            return_value=None,
            cpu_runtime=0,
            timed_out=False,
        )
        results.add(unit_inv)
        grouped = results.group_by_benchmarks(
            [bk],
            replay_dir,
            project_root,
        )
        assert 0 == len(grouped[bk])

    def test_empty_results_returns_empty_groups(
        self,
        tmp_path: Path,
    ) -> None:
        """Empty TestResults produces empty groups."""
        project_root = tmp_path
        replay_dir = tmp_path / "replay"
        replay_dir.mkdir()
        bk = BenchmarkKey(
            module_path="benchmarks.test_sort",
            function_name="sort_fn",
        )
        results = TestResults()
        grouped = results.group_by_benchmarks(
            [bk],
            replay_dir,
            project_root,
        )
        assert 0 == len(grouped[bk])

    def test_multiple_benchmark_keys(
        self,
        tmp_path: Path,
    ) -> None:
        """Results are correctly distributed across multiple keys."""
        project_root = tmp_path
        replay_dir = tmp_path / "replay"
        replay_dir.mkdir()
        bk_a = BenchmarkKey(
            module_path="benchmarks.test_a",
            function_name="fn_a",
        )
        bk_b = BenchmarkKey(
            module_path="benchmarks.test_b",
            function_name="fn_b",
        )
        prefix_a = "replay.test_benchmarks_test_a__replay_test_"
        prefix_b = "replay.test_benchmarks_test_b__replay_test_"
        results = TestResults()
        inv_a = _make_replay_invocation(
            module=prefix_a + "0",
            func="test_a",
            runtime=100,
        )
        inv_b = _make_replay_invocation(
            module=prefix_b + "0",
            func="test_b",
            runtime=200,
        )
        results.add(inv_a)
        results.add(inv_b)
        grouped = results.group_by_benchmarks(
            [bk_a, bk_b],
            replay_dir,
            project_root,
        )
        assert 1 == len(grouped[bk_a])
        assert inv_a in grouped[bk_a]
        assert 1 == len(grouped[bk_b])
        assert inv_b in grouped[bk_b]