Rename TestDiff/TestDiffScope to BehaviorDiff/BehaviorDiffScope

These classes represent behavioral verification diffs, not tests. The
Test* prefix caused pytest to attempt collection and emit warnings.
This commit is contained in:
Kevin Turcios 2026-04-23 04:37:24 -05:00
parent 9e893675c9
commit 4f98b5421f
8 changed files with 91 additions and 91 deletions

View file

@ -16,22 +16,22 @@ from codeflash_api.languages.python._markdown import (
split_markdown_code, split_markdown_code,
) )
from codeflash_api.repair.schemas import ( from codeflash_api.repair.schemas import (
TestDiff, BehaviorDiff,
TestDiffScope, BehaviorDiffScope,
) )
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
SCOPE_DESCRIPTIONS: dict[TestDiffScope, str] = { SCOPE_DESCRIPTIONS: dict[BehaviorDiffScope, str] = {
TestDiffScope.RETURN_VALUE: ( BehaviorDiffScope.RETURN_VALUE: (
"The function returned a different value in the" "The function returned a different value in the"
" optimized code compared to the original." " optimized code compared to the original."
), ),
TestDiffScope.STDOUT: ( BehaviorDiffScope.STDOUT: (
"The output printed to stdout is different in the" "The output printed to stdout is different in the"
" optimized code compared to the original." " optimized code compared to the original."
), ),
TestDiffScope.DID_PASS: ( BehaviorDiffScope.DID_PASS: (
"The test passed in one version but failed in the" "The test passed in one version but failed in the"
" other (a change in pass/fail behavior)." " other (a change in pass/fail behavior)."
), ),
@ -39,7 +39,7 @@ SCOPE_DESCRIPTIONS: dict[TestDiffScope, str] = {
def build_test_details( def build_test_details(
test_diffs: list[TestDiff], test_diffs: list[BehaviorDiff],
language: str = "python", language: str = "python",
) -> str: ) -> str:
""" """
@ -59,7 +59,7 @@ def build_test_details(
def _format_single_diff( def _format_single_diff(
diff: TestDiff, diff: BehaviorDiff,
sections: defaultdict[str, list[str]], sections: defaultdict[str, list[str]],
seen_headers: set[str], seen_headers: set[str],
test_error_label: str, test_error_label: str,
@ -92,7 +92,7 @@ def _format_single_diff(
scope_desc = SCOPE_DESCRIPTIONS.get(diff.scope, diff.scope.value) scope_desc = SCOPE_DESCRIPTIONS.get(diff.scope, diff.scope.value)
detail_lines = [f"- {scope_desc}"] detail_lines = [f"- {scope_desc}"]
if diff.scope != TestDiffScope.DID_PASS: if diff.scope != BehaviorDiffScope.DID_PASS:
detail_lines.append(f" Expected: {diff.original_value!r}") detail_lines.append(f" Expected: {diff.original_value!r}")
detail_lines.append(f" Got: {diff.candidate_value!r}") detail_lines.append(f" Got: {diff.candidate_value!r}")
else: else:
@ -110,7 +110,7 @@ def build_user_prompt(
template: str, template: str,
original_source_code: str, original_source_code: str,
modified_source_code: str, modified_source_code: str,
test_diffs: list[TestDiff], test_diffs: list[BehaviorDiff],
language: str = "python", language: str = "python",
) -> str: ) -> str:
""" """

View file

@ -6,7 +6,7 @@ from typing import Any
from pydantic import BaseModel from pydantic import BaseModel
class TestDiffScope(str, enum.Enum): class BehaviorDiffScope(str, enum.Enum):
""" """
The dimension on which a test diff was observed. The dimension on which a test diff was observed.
""" """
@ -17,12 +17,12 @@ class TestDiffScope(str, enum.Enum):
TIMED_OUT = "timed_out" TIMED_OUT = "timed_out"
class TestDiff(BaseModel): class BehaviorDiff(BaseModel):
""" """
A single behavioural difference between original and optimised code. A single behavioural difference between original and optimised code.
""" """
scope: TestDiffScope scope: BehaviorDiffScope
original_value: ( original_value: (
bool | str | int | float | dict[str, Any] | list[Any] | None bool | str | int | float | dict[str, Any] | list[Any] | None
) = None ) = None
@ -45,7 +45,7 @@ class CodeRepairRequest(BaseModel):
optimization_id: str optimization_id: str
original_source_code: str original_source_code: str
modified_source_code: str modified_source_code: str
test_diffs: list[TestDiff] test_diffs: list[BehaviorDiff]
language: str = "python" language: str = "python"
rerun_trace_id: str | None = None rerun_trace_id: str | None = None

View file

@ -12,8 +12,8 @@ from codeflash_api.repair._context import (
) )
from codeflash_api.repair.schemas import ( from codeflash_api.repair.schemas import (
CodeRepairRequest, CodeRepairRequest,
TestDiff, BehaviorDiff,
TestDiffScope, BehaviorDiffScope,
) )
# ------------------------------------------------------------------- # -------------------------------------------------------------------
@ -42,8 +42,8 @@ class TestCodeRepairRequest:
""" """
A request with all fields deserializes. A request with all fields deserializes.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value="1", original_value="1",
candidate_value="2", candidate_value="2",
original_pass=True, original_pass=True,
@ -60,18 +60,18 @@ class TestCodeRepairRequest:
rerun_trace_id=str(uuid.uuid4()), rerun_trace_id=str(uuid.uuid4()),
) )
assert 1 == len(req.test_diffs) assert 1 == len(req.test_diffs)
assert TestDiffScope.RETURN_VALUE == req.test_diffs[0].scope assert BehaviorDiffScope.RETURN_VALUE == req.test_diffs[0].scope
class TestTestDiff: class TestBehaviorDiff:
"""Tests for TestDiff schema.""" """Tests for BehaviorDiff schema."""
def test_all_scopes(self): def test_all_scopes(self):
""" """
Every scope enum value can be used. Every scope enum value can be used.
""" """
for scope in TestDiffScope: for scope in BehaviorDiffScope:
diff = TestDiff( diff = BehaviorDiff(
scope=scope, scope=scope,
original_pass=True, original_pass=True,
candidate_pass=False, candidate_pass=False,
@ -82,8 +82,8 @@ class TestTestDiff:
""" """
Optional fields default to None. Optional fields default to None.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.DID_PASS, scope=BehaviorDiffScope.DID_PASS,
original_pass=True, original_pass=True,
candidate_pass=False, candidate_pass=False,
) )
@ -112,8 +112,8 @@ class TestBuildTestDetails:
""" """
Return value diffs show Expected/Got lines. Return value diffs show Expected/Got lines.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value="hello", original_value="hello",
candidate_value="world", candidate_value="world",
original_pass=True, original_pass=True,
@ -129,8 +129,8 @@ class TestBuildTestDetails:
""" """
DID_PASS diffs show pass/fail status. DID_PASS diffs show pass/fail status.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.DID_PASS, scope=BehaviorDiffScope.DID_PASS,
original_pass=True, original_pass=True,
candidate_pass=False, candidate_pass=False,
test_src_code="test_something", test_src_code="test_something",
@ -143,8 +143,8 @@ class TestBuildTestDetails:
""" """
Pytest errors appear in the output when present. Pytest errors appear in the output when present.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=1, original_value=1,
candidate_value=2, candidate_value=2,
original_pass=True, original_pass=True,
@ -161,16 +161,16 @@ class TestBuildTestDetails:
Multiple diffs for the same test source share a header. Multiple diffs for the same test source share a header.
""" """
diffs = [ diffs = [
TestDiff( BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=1, original_value=1,
candidate_value=2, candidate_value=2,
original_pass=True, original_pass=True,
candidate_pass=False, candidate_pass=False,
test_src_code="test_func", test_src_code="test_func",
), ),
TestDiff( BehaviorDiff(
scope=TestDiffScope.STDOUT, scope=BehaviorDiffScope.STDOUT,
original_value="out", original_value="out",
candidate_value="err", candidate_value="err",
original_pass=True, original_pass=True,
@ -186,16 +186,16 @@ class TestBuildTestDetails:
Different test sources get separate sections. Different test sources get separate sections.
""" """
diffs = [ diffs = [
TestDiff( BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=1, original_value=1,
candidate_value=2, candidate_value=2,
original_pass=True, original_pass=True,
candidate_pass=False, candidate_pass=False,
test_src_code="test_a", test_src_code="test_a",
), ),
TestDiff( BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=3, original_value=3,
candidate_value=4, candidate_value=4,
original_pass=True, original_pass=True,
@ -211,8 +211,8 @@ class TestBuildTestDetails:
""" """
Non-python language uses 'Test error' label. Non-python language uses 'Test error' label.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=1, original_value=1,
candidate_value=2, candidate_value=2,
original_pass=True, original_pass=True,
@ -227,8 +227,8 @@ class TestBuildTestDetails:
""" """
Missing test source shows 'Not available'. Missing test source shows 'Not available'.
""" """
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=1, original_value=1,
candidate_value=2, candidate_value=2,
original_pass=True, original_pass=True,
@ -255,8 +255,8 @@ class TestBuildUserPrompt:
"Modified: {modified_source_code}\n" "Modified: {modified_source_code}\n"
"Tests: {test_details}" "Tests: {test_details}"
) )
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=1, original_value=1,
candidate_value=2, candidate_value=2,
original_pass=True, original_pass=True,

View file

@ -3,17 +3,17 @@
from ._baseline import establish_original_code_baseline from ._baseline import establish_original_code_baseline
from ._verification import compare_test_results from ._verification import compare_test_results
from .models import ( from .models import (
BehaviorDiff,
BehaviorDiffScope,
OptimizedCandidateResult, OptimizedCandidateResult,
OriginalCodeBaseline, OriginalCodeBaseline,
TestDiff,
TestDiffScope,
) )
__all__ = [ __all__ = [
"BehaviorDiff",
"BehaviorDiffScope",
"OptimizedCandidateResult", "OptimizedCandidateResult",
"OriginalCodeBaseline", "OriginalCodeBaseline",
"TestDiff",
"TestDiffScope",
"compare_test_results", "compare_test_results",
"establish_original_code_baseline", "establish_original_code_baseline",
] ]

View file

@ -11,7 +11,7 @@ from typing import TYPE_CHECKING
from .._model import VerificationType from .._model import VerificationType
from ..test_discovery.models import TestType from ..test_discovery.models import TestType
from ._comparator import comparator from ._comparator import comparator
from .models import TestDiff, TestDiffScope from .models import BehaviorDiff, BehaviorDiffScope
if TYPE_CHECKING: if TYPE_CHECKING:
from ..testing.models import TestResults from ..testing.models import TestResults
@ -22,7 +22,7 @@ INCREASED_RECURSION_LIMIT = 5000
_reprlib_repr = reprlib.Repr() _reprlib_repr = reprlib.Repr()
_reprlib_repr.maxstring = 1500 _reprlib_repr.maxstring = 1500
_test_diff_repr = _reprlib_repr.repr _behavior_diff_repr = _reprlib_repr.repr
def safe_repr(obj: object) -> str: def safe_repr(obj: object) -> str:
@ -48,7 +48,7 @@ def compare_test_results( # noqa: C901, PLR0912
original_results: TestResults, original_results: TestResults,
candidate_results: TestResults, candidate_results: TestResults,
pass_fail_only: bool = False, # noqa: FBT001, FBT002 pass_fail_only: bool = False, # noqa: FBT001, FBT002
) -> tuple[bool, list[TestDiff]]: ) -> tuple[bool, list[BehaviorDiff]]:
"""Compare original and candidate test results for behavioral equivalence. """Compare original and candidate test results for behavioral equivalence.
Returns a tuple of (all_match, diffs). When *pass_fail_only* is True, Returns a tuple of (all_match, diffs). When *pass_fail_only* is True,
@ -66,7 +66,7 @@ def compare_test_results( # noqa: C901, PLR0912
| candidate_results.get_all_unique_invocation_loop_ids() | candidate_results.get_all_unique_invocation_loop_ids()
) )
test_diffs: list[TestDiff] = [] test_diffs: list[BehaviorDiff] = []
did_all_timeout = True did_all_timeout = True
for test_id in test_ids_superset: for test_id in test_ids_superset:
@ -143,8 +143,8 @@ def compare_test_results( # noqa: C901, PLR0912
TestType.REPLAY_TEST, TestType.REPLAY_TEST,
} and (cdd_test_result.did_pass != original_test_result.did_pass): } and (cdd_test_result.did_pass != original_test_result.did_pass):
test_diffs.append( test_diffs.append(
TestDiff( BehaviorDiff(
scope=TestDiffScope.DID_PASS, scope=BehaviorDiffScope.DID_PASS,
original_value=str(original_test_result.did_pass), original_value=str(original_test_result.did_pass),
candidate_value=str(cdd_test_result.did_pass), candidate_value=str(cdd_test_result.did_pass),
test_src_code=( test_src_code=(
@ -164,14 +164,14 @@ def compare_test_results( # noqa: C901, PLR0912
superset_obj=superset_obj, superset_obj=superset_obj,
): ):
test_diffs.append( test_diffs.append(
TestDiff( BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_value=_test_diff_repr( original_value=_behavior_diff_repr(
safe_repr( safe_repr(
original_test_result.return_value, original_test_result.return_value,
), ),
), ),
candidate_value=_test_diff_repr( candidate_value=_behavior_diff_repr(
safe_repr( safe_repr(
cdd_test_result.return_value, cdd_test_result.return_value,
), ),
@ -214,8 +214,8 @@ def compare_test_results( # noqa: C901, PLR0912
) )
): ):
test_diffs.append( test_diffs.append(
TestDiff( BehaviorDiff(
scope=TestDiffScope.STDOUT, scope=BehaviorDiffScope.STDOUT,
original_value=str(original_test_result.stdout), original_value=str(original_test_result.stdout),
candidate_value=str(cdd_test_result.stdout), candidate_value=str(cdd_test_result.stdout),
test_src_code=( test_src_code=(

View file

@ -14,7 +14,7 @@ if TYPE_CHECKING:
from ..testing.models import TestResults from ..testing.models import TestResults
class TestDiffScope(str, enum.Enum): class BehaviorDiffScope(str, enum.Enum):
"""Scope of a behavioral difference between original and candidate.""" """Scope of a behavioral difference between original and candidate."""
RETURN_VALUE = "return_value" RETURN_VALUE = "return_value"
@ -23,10 +23,10 @@ class TestDiffScope(str, enum.Enum):
@attrs.frozen @attrs.frozen
class TestDiff: class BehaviorDiff:
"""A single behavioral difference between original and candidate.""" """A single behavioral difference between original and candidate."""
scope: TestDiffScope scope: BehaviorDiffScope
original_pass: bool original_pass: bool
candidate_pass: bool candidate_pass: bool
original_value: str | None = None original_value: str | None = None

View file

@ -501,16 +501,16 @@ class TestRunTestsAndBenchmark:
) -> None: ) -> None:
"""When behavioral tests fail with diffs, diffs are stored.""" """When behavioral tests fail with diffs, diffs are stored."""
from codeflash_python.verification.models import ( from codeflash_python.verification.models import (
TestDiff, BehaviorDiff,
TestDiffScope, BehaviorDiffScope,
) )
eval_ctx = EvaluationContext() eval_ctx = EvaluationContext()
failed_diffs: dict[str, list[Any]] = {} failed_diffs: dict[str, list[Any]] = {}
bench_results: dict[str, TestResults] = {} bench_results: dict[str, TestResults] = {}
diff_obj = TestDiff( diff_obj = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_pass=True, original_pass=True,
candidate_pass=True, candidate_pass=True,
original_value="42", original_value="42",

View file

@ -18,8 +18,8 @@ from codeflash_python.verification._verification import (
) )
from codeflash_python.verification.models import ( from codeflash_python.verification.models import (
OptimizedCandidateResult, OptimizedCandidateResult,
TestDiff, BehaviorDiff,
TestDiffScope, BehaviorDiffScope,
) )
@ -125,7 +125,7 @@ class TestCompareTestResults:
assert match is False assert match is False
assert 1 == len(diffs) assert 1 == len(diffs)
assert TestDiffScope.DID_PASS == diffs[0].scope assert BehaviorDiffScope.DID_PASS == diffs[0].scope
assert diffs[0].original_pass is True assert diffs[0].original_pass is True
assert diffs[0].candidate_pass is False assert diffs[0].candidate_pass is False
@ -142,7 +142,7 @@ class TestCompareTestResults:
assert match is False assert match is False
assert 1 == len(diffs) assert 1 == len(diffs)
assert TestDiffScope.RETURN_VALUE == diffs[0].scope assert BehaviorDiffScope.RETURN_VALUE == diffs[0].scope
def test_stdout_mismatch(self) -> None: def test_stdout_mismatch(self) -> None:
"""Same return values but different stdout produces STDOUT diff.""" """Same return values but different stdout produces STDOUT diff."""
@ -165,7 +165,7 @@ class TestCompareTestResults:
assert match is False assert match is False
assert 1 == len(diffs) assert 1 == len(diffs)
assert TestDiffScope.STDOUT == diffs[0].scope assert BehaviorDiffScope.STDOUT == diffs[0].scope
def test_pass_fail_only_skips_return_values(self) -> None: def test_pass_fail_only_skips_return_values(self) -> None:
"""When pass_fail_only=True, return value diffs are ignored.""" """When pass_fail_only=True, return value diffs are ignored."""
@ -265,7 +265,7 @@ class TestCompareTestResults:
assert [] == diffs assert [] == diffs
def test_multiple_diffs_collected(self) -> None: def test_multiple_diffs_collected(self) -> None:
"""Multiple mismatches produce multiple TestDiff entries.""" """Multiple mismatches produce multiple BehaviorDiff entries."""
original = make_results( original = make_results(
make_invocation( make_invocation(
test_function="test_a", test_function="test_a",
@ -296,8 +296,8 @@ class TestCompareTestResults:
assert match is False assert match is False
assert 2 == len(diffs) assert 2 == len(diffs)
scopes = {d.scope for d in diffs} scopes = {d.scope for d in diffs}
assert TestDiffScope.DID_PASS in scopes assert BehaviorDiffScope.DID_PASS in scopes
assert TestDiffScope.RETURN_VALUE in scopes assert BehaviorDiffScope.RETURN_VALUE in scopes
class TestPerformanceGain: class TestPerformanceGain:
@ -348,23 +348,23 @@ class TestPerformanceGain:
assert result < 0.01 assert result < 0.01
class TestTestDiffScope: class TestBehaviorDiffScope:
"""TestDiffScope enum values.""" """BehaviorDiffScope enum values."""
def test_values(self) -> None: def test_values(self) -> None:
"""The three enum values exist with expected string values.""" """The three enum values exist with expected string values."""
assert "return_value" == TestDiffScope.RETURN_VALUE.value assert "return_value" == BehaviorDiffScope.RETURN_VALUE.value
assert "stdout" == TestDiffScope.STDOUT.value assert "stdout" == BehaviorDiffScope.STDOUT.value
assert "did_pass" == TestDiffScope.DID_PASS.value assert "did_pass" == BehaviorDiffScope.DID_PASS.value
class TestTestDiff: class TestBehaviorDiff:
"""TestDiff frozen data class.""" """BehaviorDiff frozen data class."""
def test_construction(self) -> None: def test_construction(self) -> None:
"""Can construct with all fields.""" """Can construct with all fields."""
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.RETURN_VALUE, scope=BehaviorDiffScope.RETURN_VALUE,
original_pass=True, original_pass=True,
candidate_pass=True, candidate_pass=True,
original_value="42", original_value="42",
@ -374,7 +374,7 @@ class TestTestDiff:
original_pytest_error=None, original_pytest_error=None,
) )
assert TestDiffScope.RETURN_VALUE == diff.scope assert BehaviorDiffScope.RETURN_VALUE == diff.scope
assert diff.original_pass is True assert diff.original_pass is True
assert diff.candidate_pass is True assert diff.candidate_pass is True
assert "42" == diff.original_value assert "42" == diff.original_value
@ -385,19 +385,19 @@ class TestTestDiff:
def test_frozen(self) -> None: def test_frozen(self) -> None:
"""Raises on attribute assignment.""" """Raises on attribute assignment."""
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.DID_PASS, scope=BehaviorDiffScope.DID_PASS,
original_pass=True, original_pass=True,
candidate_pass=False, candidate_pass=False,
) )
with pytest.raises(attrs.exceptions.FrozenInstanceError): with pytest.raises(attrs.exceptions.FrozenInstanceError):
diff.scope = TestDiffScope.STDOUT # type: ignore[misc] diff.scope = BehaviorDiffScope.STDOUT # type: ignore[misc]
def test_default_none_fields(self) -> None: def test_default_none_fields(self) -> None:
"""Optional fields default to None.""" """Optional fields default to None."""
diff = TestDiff( diff = BehaviorDiff(
scope=TestDiffScope.STDOUT, scope=BehaviorDiffScope.STDOUT,
original_pass=True, original_pass=True,
candidate_pass=True, candidate_pass=True,
) )