Mirror of https://github.com/codeflash-ai/codeflash-agent.git (synced 2026-05-04 18:25:19 +00:00)
Rename TestDiff/TestDiffScope to BehaviorDiff/BehaviorDiffScope
These classes represent behavioral verification diffs, not tests. The Test* prefix caused pytest to attempt collection and emit warnings.
This commit is contained in:
parent: 9e893675c9
commit: 4f98b5421f
8 changed files with 91 additions and 91 deletions
|
|
@ -16,22 +16,22 @@ from codeflash_api.languages.python._markdown import (
|
|||
split_markdown_code,
|
||||
)
|
||||
from codeflash_api.repair.schemas import (
|
||||
TestDiff,
|
||||
TestDiffScope,
|
||||
BehaviorDiff,
|
||||
BehaviorDiffScope,
|
||||
)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
SCOPE_DESCRIPTIONS: dict[TestDiffScope, str] = {
|
||||
TestDiffScope.RETURN_VALUE: (
|
||||
SCOPE_DESCRIPTIONS: dict[BehaviorDiffScope, str] = {
|
||||
BehaviorDiffScope.RETURN_VALUE: (
|
||||
"The function returned a different value in the"
|
||||
" optimized code compared to the original."
|
||||
),
|
||||
TestDiffScope.STDOUT: (
|
||||
BehaviorDiffScope.STDOUT: (
|
||||
"The output printed to stdout is different in the"
|
||||
" optimized code compared to the original."
|
||||
),
|
||||
TestDiffScope.DID_PASS: (
|
||||
BehaviorDiffScope.DID_PASS: (
|
||||
"The test passed in one version but failed in the"
|
||||
" other (a change in pass/fail behavior)."
|
||||
),
|
||||
|
|
@ -39,7 +39,7 @@ SCOPE_DESCRIPTIONS: dict[TestDiffScope, str] = {
|
|||
|
||||
|
||||
def build_test_details(
|
||||
test_diffs: list[TestDiff],
|
||||
test_diffs: list[BehaviorDiff],
|
||||
language: str = "python",
|
||||
) -> str:
|
||||
"""
|
||||
|
|
@ -59,7 +59,7 @@ def build_test_details(
|
|||
|
||||
|
||||
def _format_single_diff(
|
||||
diff: TestDiff,
|
||||
diff: BehaviorDiff,
|
||||
sections: defaultdict[str, list[str]],
|
||||
seen_headers: set[str],
|
||||
test_error_label: str,
|
||||
|
|
@ -92,7 +92,7 @@ def _format_single_diff(
|
|||
|
||||
scope_desc = SCOPE_DESCRIPTIONS.get(diff.scope, diff.scope.value)
|
||||
detail_lines = [f"- {scope_desc}"]
|
||||
if diff.scope != TestDiffScope.DID_PASS:
|
||||
if diff.scope != BehaviorDiffScope.DID_PASS:
|
||||
detail_lines.append(f" Expected: {diff.original_value!r}")
|
||||
detail_lines.append(f" Got: {diff.candidate_value!r}")
|
||||
else:
|
||||
|
|
@ -110,7 +110,7 @@ def build_user_prompt(
|
|||
template: str,
|
||||
original_source_code: str,
|
||||
modified_source_code: str,
|
||||
test_diffs: list[TestDiff],
|
||||
test_diffs: list[BehaviorDiff],
|
||||
language: str = "python",
|
||||
) -> str:
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from typing import Any
|
|||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class TestDiffScope(str, enum.Enum):
|
||||
class BehaviorDiffScope(str, enum.Enum):
|
||||
"""
|
||||
The dimension on which a test diff was observed.
|
||||
"""
|
||||
|
|
@ -17,12 +17,12 @@ class TestDiffScope(str, enum.Enum):
|
|||
TIMED_OUT = "timed_out"
|
||||
|
||||
|
||||
class TestDiff(BaseModel):
|
||||
class BehaviorDiff(BaseModel):
|
||||
"""
|
||||
A single behavioural difference between original and optimised code.
|
||||
"""
|
||||
|
||||
scope: TestDiffScope
|
||||
scope: BehaviorDiffScope
|
||||
original_value: (
|
||||
bool | str | int | float | dict[str, Any] | list[Any] | None
|
||||
) = None
|
||||
|
|
@ -45,7 +45,7 @@ class CodeRepairRequest(BaseModel):
|
|||
optimization_id: str
|
||||
original_source_code: str
|
||||
modified_source_code: str
|
||||
test_diffs: list[TestDiff]
|
||||
test_diffs: list[BehaviorDiff]
|
||||
language: str = "python"
|
||||
rerun_trace_id: str | None = None
|
||||
|
||||
|
|
|
|||
|
|
@ -12,8 +12,8 @@ from codeflash_api.repair._context import (
|
|||
)
|
||||
from codeflash_api.repair.schemas import (
|
||||
CodeRepairRequest,
|
||||
TestDiff,
|
||||
TestDiffScope,
|
||||
BehaviorDiff,
|
||||
BehaviorDiffScope,
|
||||
)
|
||||
|
||||
# -------------------------------------------------------------------
|
||||
|
|
@ -42,8 +42,8 @@ class TestCodeRepairRequest:
|
|||
"""
|
||||
A request with all fields deserializes.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value="1",
|
||||
candidate_value="2",
|
||||
original_pass=True,
|
||||
|
|
@ -60,18 +60,18 @@ class TestCodeRepairRequest:
|
|||
rerun_trace_id=str(uuid.uuid4()),
|
||||
)
|
||||
assert 1 == len(req.test_diffs)
|
||||
assert TestDiffScope.RETURN_VALUE == req.test_diffs[0].scope
|
||||
assert BehaviorDiffScope.RETURN_VALUE == req.test_diffs[0].scope
|
||||
|
||||
|
||||
class TestTestDiff:
|
||||
"""Tests for TestDiff schema."""
|
||||
class TestBehaviorDiff:
|
||||
"""Tests for BehaviorDiff schema."""
|
||||
|
||||
def test_all_scopes(self):
|
||||
"""
|
||||
Every scope enum value can be used.
|
||||
"""
|
||||
for scope in TestDiffScope:
|
||||
diff = TestDiff(
|
||||
for scope in BehaviorDiffScope:
|
||||
diff = BehaviorDiff(
|
||||
scope=scope,
|
||||
original_pass=True,
|
||||
candidate_pass=False,
|
||||
|
|
@ -82,8 +82,8 @@ class TestTestDiff:
|
|||
"""
|
||||
Optional fields default to None.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.DID_PASS,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.DID_PASS,
|
||||
original_pass=True,
|
||||
candidate_pass=False,
|
||||
)
|
||||
|
|
@ -112,8 +112,8 @@ class TestBuildTestDetails:
|
|||
"""
|
||||
Return value diffs show Expected/Got lines.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value="hello",
|
||||
candidate_value="world",
|
||||
original_pass=True,
|
||||
|
|
@ -129,8 +129,8 @@ class TestBuildTestDetails:
|
|||
"""
|
||||
DID_PASS diffs show pass/fail status.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.DID_PASS,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.DID_PASS,
|
||||
original_pass=True,
|
||||
candidate_pass=False,
|
||||
test_src_code="test_something",
|
||||
|
|
@ -143,8 +143,8 @@ class TestBuildTestDetails:
|
|||
"""
|
||||
Pytest errors appear in the output when present.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=1,
|
||||
candidate_value=2,
|
||||
original_pass=True,
|
||||
|
|
@ -161,16 +161,16 @@ class TestBuildTestDetails:
|
|||
Multiple diffs for the same test source share a header.
|
||||
"""
|
||||
diffs = [
|
||||
TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=1,
|
||||
candidate_value=2,
|
||||
original_pass=True,
|
||||
candidate_pass=False,
|
||||
test_src_code="test_func",
|
||||
),
|
||||
TestDiff(
|
||||
scope=TestDiffScope.STDOUT,
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.STDOUT,
|
||||
original_value="out",
|
||||
candidate_value="err",
|
||||
original_pass=True,
|
||||
|
|
@ -186,16 +186,16 @@ class TestBuildTestDetails:
|
|||
Different test sources get separate sections.
|
||||
"""
|
||||
diffs = [
|
||||
TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=1,
|
||||
candidate_value=2,
|
||||
original_pass=True,
|
||||
candidate_pass=False,
|
||||
test_src_code="test_a",
|
||||
),
|
||||
TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=3,
|
||||
candidate_value=4,
|
||||
original_pass=True,
|
||||
|
|
@ -211,8 +211,8 @@ class TestBuildTestDetails:
|
|||
"""
|
||||
Non-python language uses 'Test error' label.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=1,
|
||||
candidate_value=2,
|
||||
original_pass=True,
|
||||
|
|
@ -227,8 +227,8 @@ class TestBuildTestDetails:
|
|||
"""
|
||||
Missing test source shows 'Not available'.
|
||||
"""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=1,
|
||||
candidate_value=2,
|
||||
original_pass=True,
|
||||
|
|
@ -255,8 +255,8 @@ class TestBuildUserPrompt:
|
|||
"Modified: {modified_source_code}\n"
|
||||
"Tests: {test_details}"
|
||||
)
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=1,
|
||||
candidate_value=2,
|
||||
original_pass=True,
|
||||
|
|
|
|||
|
|
@ -3,17 +3,17 @@
|
|||
from ._baseline import establish_original_code_baseline
|
||||
from ._verification import compare_test_results
|
||||
from .models import (
|
||||
BehaviorDiff,
|
||||
BehaviorDiffScope,
|
||||
OptimizedCandidateResult,
|
||||
OriginalCodeBaseline,
|
||||
TestDiff,
|
||||
TestDiffScope,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"BehaviorDiff",
|
||||
"BehaviorDiffScope",
|
||||
"OptimizedCandidateResult",
|
||||
"OriginalCodeBaseline",
|
||||
"TestDiff",
|
||||
"TestDiffScope",
|
||||
"compare_test_results",
|
||||
"establish_original_code_baseline",
|
||||
]
|
||||
|
|
|
|||
|
|
@ -11,7 +11,7 @@ from typing import TYPE_CHECKING
|
|||
from .._model import VerificationType
|
||||
from ..test_discovery.models import TestType
|
||||
from ._comparator import comparator
|
||||
from .models import TestDiff, TestDiffScope
|
||||
from .models import BehaviorDiff, BehaviorDiffScope
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ..testing.models import TestResults
|
||||
|
|
@ -22,7 +22,7 @@ INCREASED_RECURSION_LIMIT = 5000
|
|||
|
||||
_reprlib_repr = reprlib.Repr()
|
||||
_reprlib_repr.maxstring = 1500
|
||||
_test_diff_repr = _reprlib_repr.repr
|
||||
_behavior_diff_repr = _reprlib_repr.repr
|
||||
|
||||
|
||||
def safe_repr(obj: object) -> str:
|
||||
|
|
@ -48,7 +48,7 @@ def compare_test_results( # noqa: C901, PLR0912
|
|||
original_results: TestResults,
|
||||
candidate_results: TestResults,
|
||||
pass_fail_only: bool = False, # noqa: FBT001, FBT002
|
||||
) -> tuple[bool, list[TestDiff]]:
|
||||
) -> tuple[bool, list[BehaviorDiff]]:
|
||||
"""Compare original and candidate test results for behavioral equivalence.
|
||||
|
||||
Returns a tuple of (all_match, diffs). When *pass_fail_only* is True,
|
||||
|
|
@ -66,7 +66,7 @@ def compare_test_results( # noqa: C901, PLR0912
|
|||
| candidate_results.get_all_unique_invocation_loop_ids()
|
||||
)
|
||||
|
||||
test_diffs: list[TestDiff] = []
|
||||
test_diffs: list[BehaviorDiff] = []
|
||||
did_all_timeout = True
|
||||
|
||||
for test_id in test_ids_superset:
|
||||
|
|
@ -143,8 +143,8 @@ def compare_test_results( # noqa: C901, PLR0912
|
|||
TestType.REPLAY_TEST,
|
||||
} and (cdd_test_result.did_pass != original_test_result.did_pass):
|
||||
test_diffs.append(
|
||||
TestDiff(
|
||||
scope=TestDiffScope.DID_PASS,
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.DID_PASS,
|
||||
original_value=str(original_test_result.did_pass),
|
||||
candidate_value=str(cdd_test_result.did_pass),
|
||||
test_src_code=(
|
||||
|
|
@ -164,14 +164,14 @@ def compare_test_results( # noqa: C901, PLR0912
|
|||
superset_obj=superset_obj,
|
||||
):
|
||||
test_diffs.append(
|
||||
TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
original_value=_test_diff_repr(
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_value=_behavior_diff_repr(
|
||||
safe_repr(
|
||||
original_test_result.return_value,
|
||||
),
|
||||
),
|
||||
candidate_value=_test_diff_repr(
|
||||
candidate_value=_behavior_diff_repr(
|
||||
safe_repr(
|
||||
cdd_test_result.return_value,
|
||||
),
|
||||
|
|
@ -214,8 +214,8 @@ def compare_test_results( # noqa: C901, PLR0912
|
|||
)
|
||||
):
|
||||
test_diffs.append(
|
||||
TestDiff(
|
||||
scope=TestDiffScope.STDOUT,
|
||||
BehaviorDiff(
|
||||
scope=BehaviorDiffScope.STDOUT,
|
||||
original_value=str(original_test_result.stdout),
|
||||
candidate_value=str(cdd_test_result.stdout),
|
||||
test_src_code=(
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ if TYPE_CHECKING:
|
|||
from ..testing.models import TestResults
|
||||
|
||||
|
||||
class TestDiffScope(str, enum.Enum):
|
||||
class BehaviorDiffScope(str, enum.Enum):
|
||||
"""Scope of a behavioral difference between original and candidate."""
|
||||
|
||||
RETURN_VALUE = "return_value"
|
||||
|
|
@ -23,10 +23,10 @@ class TestDiffScope(str, enum.Enum):
|
|||
|
||||
|
||||
@attrs.frozen
|
||||
class TestDiff:
|
||||
class BehaviorDiff:
|
||||
"""A single behavioral difference between original and candidate."""
|
||||
|
||||
scope: TestDiffScope
|
||||
scope: BehaviorDiffScope
|
||||
original_pass: bool
|
||||
candidate_pass: bool
|
||||
original_value: str | None = None
|
||||
|
|
|
|||
|
|
@ -501,16 +501,16 @@ class TestRunTestsAndBenchmark:
|
|||
) -> None:
|
||||
"""When behavioral tests fail with diffs, diffs are stored."""
|
||||
from codeflash_python.verification.models import (
|
||||
TestDiff,
|
||||
TestDiffScope,
|
||||
BehaviorDiff,
|
||||
BehaviorDiffScope,
|
||||
)
|
||||
|
||||
eval_ctx = EvaluationContext()
|
||||
failed_diffs: dict[str, list[Any]] = {}
|
||||
bench_results: dict[str, TestResults] = {}
|
||||
|
||||
diff_obj = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff_obj = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_pass=True,
|
||||
candidate_pass=True,
|
||||
original_value="42",
|
||||
|
|
|
|||
|
|
@ -18,8 +18,8 @@ from codeflash_python.verification._verification import (
|
|||
)
|
||||
from codeflash_python.verification.models import (
|
||||
OptimizedCandidateResult,
|
||||
TestDiff,
|
||||
TestDiffScope,
|
||||
BehaviorDiff,
|
||||
BehaviorDiffScope,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -125,7 +125,7 @@ class TestCompareTestResults:
|
|||
|
||||
assert match is False
|
||||
assert 1 == len(diffs)
|
||||
assert TestDiffScope.DID_PASS == diffs[0].scope
|
||||
assert BehaviorDiffScope.DID_PASS == diffs[0].scope
|
||||
assert diffs[0].original_pass is True
|
||||
assert diffs[0].candidate_pass is False
|
||||
|
||||
|
|
@ -142,7 +142,7 @@ class TestCompareTestResults:
|
|||
|
||||
assert match is False
|
||||
assert 1 == len(diffs)
|
||||
assert TestDiffScope.RETURN_VALUE == diffs[0].scope
|
||||
assert BehaviorDiffScope.RETURN_VALUE == diffs[0].scope
|
||||
|
||||
def test_stdout_mismatch(self) -> None:
|
||||
"""Same return values but different stdout produces STDOUT diff."""
|
||||
|
|
@ -165,7 +165,7 @@ class TestCompareTestResults:
|
|||
|
||||
assert match is False
|
||||
assert 1 == len(diffs)
|
||||
assert TestDiffScope.STDOUT == diffs[0].scope
|
||||
assert BehaviorDiffScope.STDOUT == diffs[0].scope
|
||||
|
||||
def test_pass_fail_only_skips_return_values(self) -> None:
|
||||
"""When pass_fail_only=True, return value diffs are ignored."""
|
||||
|
|
@ -265,7 +265,7 @@ class TestCompareTestResults:
|
|||
assert [] == diffs
|
||||
|
||||
def test_multiple_diffs_collected(self) -> None:
|
||||
"""Multiple mismatches produce multiple TestDiff entries."""
|
||||
"""Multiple mismatches produce multiple BehaviorDiff entries."""
|
||||
original = make_results(
|
||||
make_invocation(
|
||||
test_function="test_a",
|
||||
|
|
@ -296,8 +296,8 @@ class TestCompareTestResults:
|
|||
assert match is False
|
||||
assert 2 == len(diffs)
|
||||
scopes = {d.scope for d in diffs}
|
||||
assert TestDiffScope.DID_PASS in scopes
|
||||
assert TestDiffScope.RETURN_VALUE in scopes
|
||||
assert BehaviorDiffScope.DID_PASS in scopes
|
||||
assert BehaviorDiffScope.RETURN_VALUE in scopes
|
||||
|
||||
|
||||
class TestPerformanceGain:
|
||||
|
|
@ -348,23 +348,23 @@ class TestPerformanceGain:
|
|||
assert result < 0.01
|
||||
|
||||
|
||||
class TestTestDiffScope:
|
||||
"""TestDiffScope enum values."""
|
||||
class TestBehaviorDiffScope:
|
||||
"""BehaviorDiffScope enum values."""
|
||||
|
||||
def test_values(self) -> None:
|
||||
"""The three enum values exist with expected string values."""
|
||||
assert "return_value" == TestDiffScope.RETURN_VALUE.value
|
||||
assert "stdout" == TestDiffScope.STDOUT.value
|
||||
assert "did_pass" == TestDiffScope.DID_PASS.value
|
||||
assert "return_value" == BehaviorDiffScope.RETURN_VALUE.value
|
||||
assert "stdout" == BehaviorDiffScope.STDOUT.value
|
||||
assert "did_pass" == BehaviorDiffScope.DID_PASS.value
|
||||
|
||||
|
||||
class TestTestDiff:
|
||||
"""TestDiff frozen data class."""
|
||||
class TestBehaviorDiff:
|
||||
"""BehaviorDiff frozen data class."""
|
||||
|
||||
def test_construction(self) -> None:
|
||||
"""Can construct with all fields."""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.RETURN_VALUE,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.RETURN_VALUE,
|
||||
original_pass=True,
|
||||
candidate_pass=True,
|
||||
original_value="42",
|
||||
|
|
@ -374,7 +374,7 @@ class TestTestDiff:
|
|||
original_pytest_error=None,
|
||||
)
|
||||
|
||||
assert TestDiffScope.RETURN_VALUE == diff.scope
|
||||
assert BehaviorDiffScope.RETURN_VALUE == diff.scope
|
||||
assert diff.original_pass is True
|
||||
assert diff.candidate_pass is True
|
||||
assert "42" == diff.original_value
|
||||
|
|
@ -385,19 +385,19 @@ class TestTestDiff:
|
|||
|
||||
def test_frozen(self) -> None:
|
||||
"""Raises on attribute assignment."""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.DID_PASS,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.DID_PASS,
|
||||
original_pass=True,
|
||||
candidate_pass=False,
|
||||
)
|
||||
|
||||
with pytest.raises(attrs.exceptions.FrozenInstanceError):
|
||||
diff.scope = TestDiffScope.STDOUT # type: ignore[misc]
|
||||
diff.scope = BehaviorDiffScope.STDOUT # type: ignore[misc]
|
||||
|
||||
def test_default_none_fields(self) -> None:
|
||||
"""Optional fields default to None."""
|
||||
diff = TestDiff(
|
||||
scope=TestDiffScope.STDOUT,
|
||||
diff = BehaviorDiff(
|
||||
scope=BehaviorDiffScope.STDOUT,
|
||||
original_pass=True,
|
||||
candidate_pass=True,
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in a new issue