Rename TestDiff/TestDiffScope to BehaviorDiff/BehaviorDiffScope

These classes represent behavioral verification diffs, not tests. The Test* prefix matched pytest's default test-class naming pattern, so pytest attempted to collect them as test classes and emitted collection warnings.
2026-05-04 18:25:19 +00:00 · 2026-04-23 04:37:24 -05:00 · 2026-04-23 04:37:24 -05:00 · 4f98b5421f
commit 4f98b5421f
parent 9e893675c9
8 changed files with 91 additions and 91 deletions
--- a/packages/codeflash-api/src/codeflash_api/repair/_context.py
+++ b/packages/codeflash-api/src/codeflash_api/repair/_context.py
@ -16,22 +16,22 @@ from codeflash_api.languages.python._markdown import (
    split_markdown_code,
 )
 from codeflash_api.repair.schemas import (
-    TestDiff,
-    TestDiffScope,
+    BehaviorDiff,
+    BehaviorDiffScope,
 )

 log = logging.getLogger(__name__)

-SCOPE_DESCRIPTIONS: dict[TestDiffScope, str] = {
-    TestDiffScope.RETURN_VALUE: (
+SCOPE_DESCRIPTIONS: dict[BehaviorDiffScope, str] = {
+    BehaviorDiffScope.RETURN_VALUE: (
        "The function returned a different value in the"
        " optimized code compared to the original."
    ),
-    TestDiffScope.STDOUT: (
+    BehaviorDiffScope.STDOUT: (
        "The output printed to stdout is different in the"
        " optimized code compared to the original."
    ),
-    TestDiffScope.DID_PASS: (
+    BehaviorDiffScope.DID_PASS: (
        "The test passed in one version but failed in the"
        " other (a change in pass/fail behavior)."
    ),
@ -39,7 +39,7 @@ SCOPE_DESCRIPTIONS: dict[TestDiffScope, str] = {


 def build_test_details(
-    test_diffs: list[TestDiff],
+    test_diffs: list[BehaviorDiff],
    language: str = "python",
 ) -> str:
    """
@ -59,7 +59,7 @@ def build_test_details(


 def _format_single_diff(
-    diff: TestDiff,
+    diff: BehaviorDiff,
    sections: defaultdict[str, list[str]],
    seen_headers: set[str],
    test_error_label: str,
@ -92,7 +92,7 @@ def _format_single_diff(

        scope_desc = SCOPE_DESCRIPTIONS.get(diff.scope, diff.scope.value)
        detail_lines = [f"- {scope_desc}"]
-        if diff.scope != TestDiffScope.DID_PASS:
+        if diff.scope != BehaviorDiffScope.DID_PASS:
            detail_lines.append(f"  Expected: {diff.original_value!r}")
            detail_lines.append(f"  Got:      {diff.candidate_value!r}")
        else:
@ -110,7 +110,7 @@ def build_user_prompt(
    template: str,
    original_source_code: str,
    modified_source_code: str,
-    test_diffs: list[TestDiff],
+    test_diffs: list[BehaviorDiff],
    language: str = "python",
 ) -> str:
    """
--- a/packages/codeflash-api/src/codeflash_api/repair/schemas.py
+++ b/packages/codeflash-api/src/codeflash_api/repair/schemas.py
@ -6,7 +6,7 @@ from typing import Any
 from pydantic import BaseModel


-class TestDiffScope(str, enum.Enum):
+class BehaviorDiffScope(str, enum.Enum):
    """
    The dimension on which a test diff was observed.
    """
@ -17,12 +17,12 @@ class TestDiffScope(str, enum.Enum):
    TIMED_OUT = "timed_out"


-class TestDiff(BaseModel):
+class BehaviorDiff(BaseModel):
    """
    A single behavioural difference between original and optimised code.
    """

-    scope: TestDiffScope
+    scope: BehaviorDiffScope
    original_value: (
        bool | str | int | float | dict[str, Any] | list[Any] | None
    ) = None
@ -45,7 +45,7 @@ class CodeRepairRequest(BaseModel):
    optimization_id: str
    original_source_code: str
    modified_source_code: str
-    test_diffs: list[TestDiff]
+    test_diffs: list[BehaviorDiff]
    language: str = "python"
    rerun_trace_id: str | None = None

--- a/packages/codeflash-api/tests/test_repair.py
+++ b/packages/codeflash-api/tests/test_repair.py
@ -12,8 +12,8 @@ from codeflash_api.repair._context import (
 )
 from codeflash_api.repair.schemas import (
    CodeRepairRequest,
-    TestDiff,
-    TestDiffScope,
+    BehaviorDiff,
+    BehaviorDiffScope,
 )

 # -------------------------------------------------------------------
@ -42,8 +42,8 @@ class TestCodeRepairRequest:
        """
        A request with all fields deserializes.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_value="1",
            candidate_value="2",
            original_pass=True,
@ -60,18 +60,18 @@ class TestCodeRepairRequest:
            rerun_trace_id=str(uuid.uuid4()),
        )
        assert 1 == len(req.test_diffs)
-        assert TestDiffScope.RETURN_VALUE == req.test_diffs[0].scope
+        assert BehaviorDiffScope.RETURN_VALUE == req.test_diffs[0].scope


-class TestTestDiff:
-    """Tests for TestDiff schema."""
+class TestBehaviorDiff:
+    """Tests for BehaviorDiff schema."""

    def test_all_scopes(self):
        """
        Every scope enum value can be used.
        """
-        for scope in TestDiffScope:
-            diff = TestDiff(
+        for scope in BehaviorDiffScope:
+            diff = BehaviorDiff(
                scope=scope,
                original_pass=True,
                candidate_pass=False,
@ -82,8 +82,8 @@ class TestTestDiff:
        """
        Optional fields default to None.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.DID_PASS,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.DID_PASS,
            original_pass=True,
            candidate_pass=False,
        )
@ -112,8 +112,8 @@ class TestBuildTestDetails:
        """
        Return value diffs show Expected/Got lines.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_value="hello",
            candidate_value="world",
            original_pass=True,
@ -129,8 +129,8 @@ class TestBuildTestDetails:
        """
        DID_PASS diffs show pass/fail status.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.DID_PASS,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.DID_PASS,
            original_pass=True,
            candidate_pass=False,
            test_src_code="test_something",
@ -143,8 +143,8 @@ class TestBuildTestDetails:
        """
        Pytest errors appear in the output when present.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_value=1,
            candidate_value=2,
            original_pass=True,
@ -161,16 +161,16 @@ class TestBuildTestDetails:
        Multiple diffs for the same test source share a header.
        """
        diffs = [
-            TestDiff(
-                scope=TestDiffScope.RETURN_VALUE,
+            BehaviorDiff(
+                scope=BehaviorDiffScope.RETURN_VALUE,
                original_value=1,
                candidate_value=2,
                original_pass=True,
                candidate_pass=False,
                test_src_code="test_func",
            ),
-            TestDiff(
-                scope=TestDiffScope.STDOUT,
+            BehaviorDiff(
+                scope=BehaviorDiffScope.STDOUT,
                original_value="out",
                candidate_value="err",
                original_pass=True,
@ -186,16 +186,16 @@ class TestBuildTestDetails:
        Different test sources get separate sections.
        """
        diffs = [
-            TestDiff(
-                scope=TestDiffScope.RETURN_VALUE,
+            BehaviorDiff(
+                scope=BehaviorDiffScope.RETURN_VALUE,
                original_value=1,
                candidate_value=2,
                original_pass=True,
                candidate_pass=False,
                test_src_code="test_a",
            ),
-            TestDiff(
-                scope=TestDiffScope.RETURN_VALUE,
+            BehaviorDiff(
+                scope=BehaviorDiffScope.RETURN_VALUE,
                original_value=3,
                candidate_value=4,
                original_pass=True,
@ -211,8 +211,8 @@ class TestBuildTestDetails:
        """
        Non-python language uses 'Test error' label.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_value=1,
            candidate_value=2,
            original_pass=True,
@ -227,8 +227,8 @@ class TestBuildTestDetails:
        """
        Missing test source shows 'Not available'.
        """
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_value=1,
            candidate_value=2,
            original_pass=True,
@ -255,8 +255,8 @@ class TestBuildUserPrompt:
            "Modified: {modified_source_code}\n"
            "Tests: {test_details}"
        )
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_value=1,
            candidate_value=2,
            original_pass=True,
--- a/packages/codeflash-python/src/codeflash_python/verification/__init__.py
+++ b/packages/codeflash-python/src/codeflash_python/verification/__init__.py
@ -3,17 +3,17 @@
 from ._baseline import establish_original_code_baseline
 from ._verification import compare_test_results
 from .models import (
+    BehaviorDiff,
+    BehaviorDiffScope,
    OptimizedCandidateResult,
    OriginalCodeBaseline,
-    TestDiff,
-    TestDiffScope,
 )

 __all__ = [
+    "BehaviorDiff",
+    "BehaviorDiffScope",
    "OptimizedCandidateResult",
    "OriginalCodeBaseline",
-    "TestDiff",
-    "TestDiffScope",
    "compare_test_results",
    "establish_original_code_baseline",
 ]
--- a/packages/codeflash-python/src/codeflash_python/verification/_verification.py
+++ b/packages/codeflash-python/src/codeflash_python/verification/_verification.py
@ -11,7 +11,7 @@ from typing import TYPE_CHECKING
 from .._model import VerificationType
 from ..test_discovery.models import TestType
 from ._comparator import comparator
-from .models import TestDiff, TestDiffScope
+from .models import BehaviorDiff, BehaviorDiffScope

 if TYPE_CHECKING:
    from ..testing.models import TestResults
@ -22,7 +22,7 @@ INCREASED_RECURSION_LIMIT = 5000

 _reprlib_repr = reprlib.Repr()
 _reprlib_repr.maxstring = 1500
-_test_diff_repr = _reprlib_repr.repr
+_behavior_diff_repr = _reprlib_repr.repr


 def safe_repr(obj: object) -> str:
@ -48,7 +48,7 @@ def compare_test_results(  # noqa: C901, PLR0912
    original_results: TestResults,
    candidate_results: TestResults,
    pass_fail_only: bool = False,  # noqa: FBT001, FBT002
-) -> tuple[bool, list[TestDiff]]:
+) -> tuple[bool, list[BehaviorDiff]]:
    """Compare original and candidate test results for behavioral equivalence.

    Returns a tuple of (all_match, diffs).  When *pass_fail_only* is True,
@ -66,7 +66,7 @@ def compare_test_results(  # noqa: C901, PLR0912
        | candidate_results.get_all_unique_invocation_loop_ids()
    )

-    test_diffs: list[TestDiff] = []
+    test_diffs: list[BehaviorDiff] = []
    did_all_timeout = True

    for test_id in test_ids_superset:
@ -143,8 +143,8 @@ def compare_test_results(  # noqa: C901, PLR0912
            TestType.REPLAY_TEST,
        } and (cdd_test_result.did_pass != original_test_result.did_pass):
            test_diffs.append(
-                TestDiff(
-                    scope=TestDiffScope.DID_PASS,
+                BehaviorDiff(
+                    scope=BehaviorDiffScope.DID_PASS,
                    original_value=str(original_test_result.did_pass),
                    candidate_value=str(cdd_test_result.did_pass),
                    test_src_code=(
@ -164,14 +164,14 @@ def compare_test_results(  # noqa: C901, PLR0912
            superset_obj=superset_obj,
        ):
            test_diffs.append(
-                TestDiff(
-                    scope=TestDiffScope.RETURN_VALUE,
-                    original_value=_test_diff_repr(
+                BehaviorDiff(
+                    scope=BehaviorDiffScope.RETURN_VALUE,
+                    original_value=_behavior_diff_repr(
                        safe_repr(
                            original_test_result.return_value,
                        ),
                    ),
-                    candidate_value=_test_diff_repr(
+                    candidate_value=_behavior_diff_repr(
                        safe_repr(
                            cdd_test_result.return_value,
                        ),
@ -214,8 +214,8 @@ def compare_test_results(  # noqa: C901, PLR0912
            )
        ):
            test_diffs.append(
-                TestDiff(
-                    scope=TestDiffScope.STDOUT,
+                BehaviorDiff(
+                    scope=BehaviorDiffScope.STDOUT,
                    original_value=str(original_test_result.stdout),
                    candidate_value=str(cdd_test_result.stdout),
                    test_src_code=(
--- a/packages/codeflash-python/src/codeflash_python/verification/models.py
+++ b/packages/codeflash-python/src/codeflash_python/verification/models.py
@ -14,7 +14,7 @@ if TYPE_CHECKING:
    from ..testing.models import TestResults


-class TestDiffScope(str, enum.Enum):
+class BehaviorDiffScope(str, enum.Enum):
    """Scope of a behavioral difference between original and candidate."""

    RETURN_VALUE = "return_value"
@ -23,10 +23,10 @@ class TestDiffScope(str, enum.Enum):


@attrs.frozen
-class TestDiff:
+class BehaviorDiff:
    """A single behavioral difference between original and candidate."""

-    scope: TestDiffScope
+    scope: BehaviorDiffScope
    original_pass: bool
    candidate_pass: bool
    original_value: str | None = None
--- a/packages/codeflash-python/tests/test_candidate_eval.py
+++ b/packages/codeflash-python/tests/test_candidate_eval.py
@ -501,16 +501,16 @@ class TestRunTestsAndBenchmark:
    ) -> None:
        """When behavioral tests fail with diffs, diffs are stored."""
        from codeflash_python.verification.models import (
-            TestDiff,
-            TestDiffScope,
+            BehaviorDiff,
+            BehaviorDiffScope,
        )

        eval_ctx = EvaluationContext()
        failed_diffs: dict[str, list[Any]] = {}
        bench_results: dict[str, TestResults] = {}

-        diff_obj = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff_obj = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_pass=True,
            candidate_pass=True,
            original_value="42",
--- a/packages/codeflash-python/tests/test_verification.py
+++ b/packages/codeflash-python/tests/test_verification.py
@ -18,8 +18,8 @@ from codeflash_python.verification._verification import (
 )
 from codeflash_python.verification.models import (
    OptimizedCandidateResult,
-    TestDiff,
-    TestDiffScope,
+    BehaviorDiff,
+    BehaviorDiffScope,
 )


@ -125,7 +125,7 @@ class TestCompareTestResults:

        assert match is False
        assert 1 == len(diffs)
-        assert TestDiffScope.DID_PASS == diffs[0].scope
+        assert BehaviorDiffScope.DID_PASS == diffs[0].scope
        assert diffs[0].original_pass is True
        assert diffs[0].candidate_pass is False

@ -142,7 +142,7 @@ class TestCompareTestResults:

        assert match is False
        assert 1 == len(diffs)
-        assert TestDiffScope.RETURN_VALUE == diffs[0].scope
+        assert BehaviorDiffScope.RETURN_VALUE == diffs[0].scope

    def test_stdout_mismatch(self) -> None:
        """Same return values but different stdout produces STDOUT diff."""
@ -165,7 +165,7 @@ class TestCompareTestResults:

        assert match is False
        assert 1 == len(diffs)
-        assert TestDiffScope.STDOUT == diffs[0].scope
+        assert BehaviorDiffScope.STDOUT == diffs[0].scope

    def test_pass_fail_only_skips_return_values(self) -> None:
        """When pass_fail_only=True, return value diffs are ignored."""
@ -265,7 +265,7 @@ class TestCompareTestResults:
        assert [] == diffs

    def test_multiple_diffs_collected(self) -> None:
-        """Multiple mismatches produce multiple TestDiff entries."""
+        """Multiple mismatches produce multiple BehaviorDiff entries."""
        original = make_results(
            make_invocation(
                test_function="test_a",
@ -296,8 +296,8 @@ class TestCompareTestResults:
        assert match is False
        assert 2 == len(diffs)
        scopes = {d.scope for d in diffs}
-        assert TestDiffScope.DID_PASS in scopes
-        assert TestDiffScope.RETURN_VALUE in scopes
+        assert BehaviorDiffScope.DID_PASS in scopes
+        assert BehaviorDiffScope.RETURN_VALUE in scopes


 class TestPerformanceGain:
@ -348,23 +348,23 @@ class TestPerformanceGain:
        assert result < 0.01


-class TestTestDiffScope:
-    """TestDiffScope enum values."""
+class TestBehaviorDiffScope:
+    """BehaviorDiffScope enum values."""

    def test_values(self) -> None:
        """The three enum values exist with expected string values."""
-        assert "return_value" == TestDiffScope.RETURN_VALUE.value
-        assert "stdout" == TestDiffScope.STDOUT.value
-        assert "did_pass" == TestDiffScope.DID_PASS.value
+        assert "return_value" == BehaviorDiffScope.RETURN_VALUE.value
+        assert "stdout" == BehaviorDiffScope.STDOUT.value
+        assert "did_pass" == BehaviorDiffScope.DID_PASS.value


-class TestTestDiff:
-    """TestDiff frozen data class."""
+class TestBehaviorDiff:
+    """BehaviorDiff frozen data class."""

    def test_construction(self) -> None:
        """Can construct with all fields."""
-        diff = TestDiff(
-            scope=TestDiffScope.RETURN_VALUE,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.RETURN_VALUE,
            original_pass=True,
            candidate_pass=True,
            original_value="42",
@ -374,7 +374,7 @@ class TestTestDiff:
            original_pytest_error=None,
        )

-        assert TestDiffScope.RETURN_VALUE == diff.scope
+        assert BehaviorDiffScope.RETURN_VALUE == diff.scope
        assert diff.original_pass is True
        assert diff.candidate_pass is True
        assert "42" == diff.original_value
@ -385,19 +385,19 @@ class TestTestDiff:

    def test_frozen(self) -> None:
        """Raises on attribute assignment."""
-        diff = TestDiff(
-            scope=TestDiffScope.DID_PASS,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.DID_PASS,
            original_pass=True,
            candidate_pass=False,
        )

        with pytest.raises(attrs.exceptions.FrozenInstanceError):
-            diff.scope = TestDiffScope.STDOUT  # type: ignore[misc]
+            diff.scope = BehaviorDiffScope.STDOUT  # type: ignore[misc]

    def test_default_none_fields(self) -> None:
        """Optional fields default to None."""
-        diff = TestDiff(
-            scope=TestDiffScope.STDOUT,
+        diff = BehaviorDiff(
+            scope=BehaviorDiffScope.STDOUT,
            original_pass=True,
            candidate_pass=True,
        )