# codeflash-agent/packages/codeflash-python/tests/test_instrument_all_and_run.py

from __future__ import annotations

import importlib
import os
import sys
import tempfile
from pathlib import Path

from codeflash_python._model import (
    FunctionParent,
    FunctionToOptimize,
    TestingMode,
)
from codeflash_python.test_discovery.models import CodePosition, TestType
from codeflash_python.testing._instrument_async import write_async_helper_file
from codeflash_python.testing._instrument_capture import (
    instrument_codeflash_capture,
)
from codeflash_python.testing._instrument_sync import (
    add_sync_decorator_to_function,
)
from codeflash_python.testing._instrumentation import (
    inject_profiling_into_existing_test,
)
from codeflash_python.testing._parse_results import parse_test_results
from codeflash_python.testing._test_runner import run_behavioral_tests
from codeflash_python.testing.models import TestConfig, TestFile, TestFiles
from codeflash_python.verification._verification import compare_test_results

# Directory containing this test file, resolved to an absolute path.  The
# ``code_to_optimize`` fixture paths used by the tests below are built
# relative to it, and it is also passed to the tooling as the project root.
project_root = Path(__file__).parent.resolve()


def _run_and_parse(
    test_files: TestFiles,
    test_env: dict[str, str],
    test_config: TestConfig,
) -> list[object]:
    """Execute the behavioral tests once and return their parsed results.

    Stands in for the former ``Optimizer.run_and_parse_tests``: the tests
    are launched with ``test_config.project_root_path`` as working
    directory and ``test_config.pytest_cmd`` as the pytest command, then
    the produced XML report and run result are handed to
    ``parse_test_results`` for optimization iteration 0.
    """
    behavioral_run = run_behavioral_tests(
        test_files=test_files,
        test_env=test_env,
        cwd=test_config.project_root_path,
        pytest_cmd=test_config.pytest_cmd,
    )
    # The runner yields four values; only the XML report path and the run
    # result are needed for parsing.
    report_xml, process_result, _, _ = behavioral_run
    parsed_results = parse_test_results(
        test_xml_path=report_xml,
        test_files=test_files,
        test_config=test_config,
        optimization_iteration=0,
        run_result=process_result,
    )
    return parsed_results


def test_bubble_sort_behavior_results() -> None:
    """Instrument a test of the module-level ``sorter`` and check its results.

    A small pytest file is written, behavior-mode profiling is injected
    into it, the sync decorator is added to ``sorter`` in
    ``code_to_optimize/bubble_sort.py``, and the instrumented test is run
    twice.  The parsed results of the first run are inspected field by
    field and then compared against the second run.  All files touched
    are restored or removed in the ``finally`` block.
    """
    code = """from code_to_optimize.bubble_sort import sorter


def test_sort():
    input = [5, 4, 3, 2, 1, 0]
    output = sorter(input)
    assert output == [0, 1, 2, 3, 4, 5]

    input = [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]
    output = sorter(input)
    assert output == [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""

    test_path = (
        project_root
        / "code_to_optimize/tests/pytest/test_perfinjector_bubble_sort_results_temp.py"
    ).resolve()
    test_path_perf = (
        project_root
        / "code_to_optimize/tests/pytest/test_perfinjector_bubble_sort_results_perf_temp.py"
    ).resolve()
    fto_path = (project_root / "code_to_optimize/bubble_sort.py").resolve()
    # Keep the pristine source so it can be restored after decoration.
    original_code = fto_path.read_text("utf-8")
    try:
        test_path.write_text(code, encoding="utf-8")

        tests_root = (
            project_root / "code_to_optimize/tests/pytest/"
        ).resolve()
        project_root_path = project_root
        original_cwd = Path.cwd()
        run_cwd = project_root
        func = FunctionToOptimize(
            function_name="sorter",
            parents=(),
            file_path=Path(fto_path),
        )
        os.chdir(run_cwd)
        try:
            # The positions point at the two ``sorter(input)`` calls in the
            # test source above (lines 6 and 10, column 13).
            success, new_test = inject_profiling_into_existing_test(
                test_path,
                [CodePosition(6, 13), CodePosition(10, 13)],
                func,
                project_root_path,
                mode=TestingMode.BEHAVIOR,
            )
        finally:
            # Always restore the working directory, even if instrumentation
            # raises, so later tests are not run from the wrong directory.
            os.chdir(original_cwd)
        assert success
        assert new_test is not None

        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file (contains sync decorators too)
        write_async_helper_file(project_root_path)

        # Add sync decorator to the source function
        add_sync_decorator_to_function(
            fto_path,
            func,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST

        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )
        test_results = _run_and_parse(test_files, test_env, test_config)

        # New decorator captures stdout directly -- the function prints two lines
        assert test_results[0].id.function_getting_tested == "sorter"
        assert test_results[0].id.test_class_name is None
        assert test_results[0].id.test_function_name == "test_sort"
        assert (
            test_results[0].id.test_module_path
            == "code_to_optimize.tests.pytest.test_perfinjector_bubble_sort_results_temp"
        )
        assert test_results[0].runtime > 0
        assert test_results[0].did_pass
        # return_value is ((args, kwargs, return_value),) in the new path
        assert test_results[0].return_value[0][2] == [0, 1, 2, 3, 4, 5]
        out_str = (
            "codeflash stdout: Sorting list\nresult: [0, 1, 2, 3, 4, 5]\n"
        )
        assert test_results[0].stdout == out_str

        assert test_results[1].id.function_getting_tested == "sorter"
        assert test_results[1].id.test_class_name is None
        assert test_results[1].id.test_function_name == "test_sort"
        assert (
            test_results[1].id.test_module_path
            == "code_to_optimize.tests.pytest.test_perfinjector_bubble_sort_results_temp"
        )
        assert test_results[1].runtime > 0
        assert test_results[1].did_pass

        # A second run of the same instrumented test must compare as equal.
        results2 = _run_and_parse(test_files, test_env, test_config)
        match, _ = compare_test_results(test_results, results2)
        assert match
    finally:
        # Undo every on-disk change: restore the source and drop temp files.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)


def test_method_full_instrumentation() -> None:
    """Instrument a test of ``BubbleSorter.sorter`` and compare two variants.

    The test file is instrumented, the sync decorator and codeflash
    capture are applied to ``code_to_optimize/bubble_sort_method.py``, and
    the test is run.  Four results are expected: two ``__init__`` captures
    followed by two ``sorter`` calls.  The run is repeated and must match.
    The source is then replaced with a variant whose ``__init__`` default
    differs, re-instrumented and re-run; those results must NOT match the
    first run.  All files touched are restored or removed in ``finally``.
    """
    code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_sort():
    input = [5, 4, 3, 2, 1, 0]
    sort_class = BubbleSorter()
    output = sort_class.sorter(input)
    assert output == [0, 1, 2, 3, 4, 5]

    input = [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]
    sort_class = BubbleSorter()
    output = sort_class.sorter(input)
    assert output == [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""

    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    # Keep the pristine source so it can be restored after the test.
    original_code = fto_path.read_text("utf-8")
    fto = FunctionToOptimize(
        function_name="sorter",
        parents=(FunctionParent(name="BubbleSorter", type="ClassDef"),),
        file_path=Path(fto_path),
    )
    tests_root = (project_root / "code_to_optimize/tests/pytest/").resolve()
    test_path = tests_root / "test_class_method_behavior_results_temp.py"
    test_path_perf = (
        tests_root / "test_class_method_behavior_results_perf_temp.py"
    )
    project_root_path = project_root

    try:
        # Write and instrument the test file
        test_path.write_text(code, encoding="utf-8")
        original_cwd = Path.cwd()
        os.chdir(project_root_path)
        try:
            # The positions point at the two ``sort_class.sorter(input)``
            # calls in the test source above (lines 7 and 12, column 13).
            success, new_test = inject_profiling_into_existing_test(
                test_path,
                [CodePosition(7, 13), CodePosition(12, 13)],
                fto,
                project_root_path,
            )
        finally:
            # Always restore the working directory, even if instrumentation
            # raises, so later tests are not run from the wrong directory.
            os.chdir(original_cwd)
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            fto,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        # Add codeflash capture for __init__ state
        instrument_codeflash_capture(fto, {}, tests_root)

        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )
        test_results = _run_and_parse(test_files, test_env, test_config)
        assert len(test_results) == 4
        # Order: init results (from codeflash_capture) then sorter results (from sync decorator)
        assert (
            test_results[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert test_results[0].id.test_function_name == "test_sort"
        assert test_results[0].did_pass
        assert test_results[0].return_value[0] == {"x": 0}
        assert (
            test_results[1].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert test_results[1].id.test_function_name == "test_sort"
        assert test_results[1].did_pass
        assert test_results[1].return_value[0] == {"x": 0}

        assert test_results[2].id.function_getting_tested == "sorter"
        assert test_results[2].id.test_class_name is None
        assert test_results[2].id.test_function_name == "test_sort"
        assert (
            test_results[2].id.test_module_path
            == "code_to_optimize.tests.pytest.test_class_method_behavior_results_temp"
        )
        assert test_results[2].runtime > 0
        assert test_results[2].did_pass
        # return_value is ((args, kwargs, return_value),) in the new path
        assert test_results[2].return_value[0][2] == [0, 1, 2, 3, 4, 5]
        assert (
            test_results[2].stdout
            == "codeflash stdout : BubbleSorter.sorter() called\n"
        )
        # Sanity check: a result set always compares equal to itself.
        match, _ = compare_test_results(test_results, test_results)
        assert match

        assert test_results[3].id.function_getting_tested == "sorter"
        assert test_results[3].id.test_class_name is None
        assert test_results[3].id.test_function_name == "test_sort"
        assert (
            test_results[3].id.test_module_path
            == "code_to_optimize.tests.pytest.test_class_method_behavior_results_temp"
        )
        assert test_results[3].runtime > 0
        assert test_results[3].did_pass
        assert (
            test_results[3].stdout
            == "codeflash stdout : BubbleSorter.sorter() called\n"
        )

        # A second run of the same instrumented test must compare as equal.
        results2 = _run_and_parse(test_files, test_env, test_config)

        match, _ = compare_test_results(test_results, results2)
        assert match

        # Replace the source with a variant whose __init__ defaults x to 1,
        # so the captured instance state becomes {"x": 1}.
        # NOTE(review): the original file presumably defaults x to 0, given
        # the {"x": 0} assertions above -- confirm against the fixture.
        optimized_code = """
class BubbleSorter:
    def __init__(self, x=1):
        self.x = x

    def sorter(self, arr):
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        return arr

        """

        fto_path.write_text(optimized_code, "utf-8")

        # Force reload of module
        module_name = "code_to_optimize.bubble_sort_method"
        if module_name not in sys.modules:
            __import__(module_name)
        importlib.reload(sys.modules[module_name])

        # Re-add sync decorator and codeflash capture to the new source
        add_sync_decorator_to_function(
            fto_path,
            fto,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        instrument_codeflash_capture(fto, {}, tests_root)
        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )
        new_test_results = _run_and_parse(test_files, test_env, test_config)
        assert len(new_test_results) == 4
        # Order: init results then sorter results
        assert (
            new_test_results[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert new_test_results[0].id.test_function_name == "test_sort"
        assert new_test_results[0].did_pass
        assert new_test_results[0].return_value[0] == {"x": 1}

        assert (
            new_test_results[1].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert new_test_results[1].id.test_function_name == "test_sort"
        assert new_test_results[1].did_pass
        assert new_test_results[1].return_value[0] == {"x": 1}

        assert new_test_results[2].id.function_getting_tested == "sorter"
        assert new_test_results[2].id.test_class_name is None
        assert new_test_results[2].id.test_function_name == "test_sort"
        assert (
            new_test_results[2].id.test_module_path
            == "code_to_optimize.tests.pytest.test_class_method_behavior_results_temp"
        )
        assert new_test_results[2].runtime > 0
        assert new_test_results[2].did_pass
        assert new_test_results[2].return_value[0][2] == [0, 1, 2, 3, 4, 5]

        assert new_test_results[3].id.function_getting_tested == "sorter"
        assert new_test_results[3].id.test_class_name is None
        assert new_test_results[3].id.test_function_name == "test_sort"
        assert (
            new_test_results[3].id.test_module_path
            == "code_to_optimize.tests.pytest.test_class_method_behavior_results_temp"
        )
        assert new_test_results[3].runtime > 0
        assert new_test_results[3].did_pass
        # The replaced source must be reported as behaviorally different.
        match, _ = compare_test_results(test_results, new_test_results)
        assert not match

    finally:
        # Undo every on-disk change: restore the source and drop temp files.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)


def test_classmethod_full_instrumentation() -> None:
    """Instrument a test of ``BubbleSorter.sorter_classmethod`` and check results.

    The test file is instrumented, the sync decorator and codeflash
    capture are applied to ``code_to_optimize/bubble_sort_method.py``, and
    the test is run.  Exactly two results are expected, one per call to
    ``sorter_classmethod``.  The run is then repeated and the two result
    sets must match.  All files touched are restored or removed in
    ``finally``.
    """
    code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_sort():
    input = [5, 4, 3, 2, 1, 0]
    output = BubbleSorter.sorter_classmethod(input)
    assert output == [0, 1, 2, 3, 4, 5]

    input = [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]
    output = BubbleSorter.sorter_classmethod(input)
    assert output == [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""

    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    # Keep the pristine source so it can be restored after the test.
    original_code = fto_path.read_text("utf-8")
    fto = FunctionToOptimize(
        function_name="sorter_classmethod",
        parents=(FunctionParent(name="BubbleSorter", type="ClassDef"),),
        file_path=Path(fto_path),
    )
    tests_root = (project_root / "code_to_optimize/tests/pytest/").resolve()
    test_path = tests_root / "test_classmethod_behavior_results_temp.py"
    test_path_perf = (
        tests_root / "test_classmethod_behavior_results_perf_temp.py"
    )
    project_root_path = project_root

    try:
        # Write and instrument the test file
        test_path.write_text(code, encoding="utf-8")
        original_cwd = Path.cwd()
        os.chdir(project_root_path)
        try:
            # The positions point at the two ``sorter_classmethod`` call
            # lines in the test source above (lines 6 and 10, column 13).
            success, new_test = inject_profiling_into_existing_test(
                test_path,
                [CodePosition(6, 13), CodePosition(10, 13)],
                fto,
                project_root_path,
            )
        finally:
            # Always restore the working directory, even if instrumentation
            # raises, so later tests are not run from the wrong directory.
            os.chdir(original_cwd)
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            fto,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        # Add codeflash capture
        instrument_codeflash_capture(fto, {}, tests_root)

        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )
        test_results = _run_and_parse(test_files, test_env, test_config)
        assert len(test_results) == 2
        assert (
            test_results[0].id.function_getting_tested == "sorter_classmethod"
        )
        assert test_results[0].id.test_class_name is None
        assert test_results[0].id.test_function_name == "test_sort"
        assert (
            test_results[0].id.test_module_path
            == "code_to_optimize.tests.pytest.test_classmethod_behavior_results_temp"
        )
        assert test_results[0].runtime > 0
        assert test_results[0].did_pass
        # return_value[0][2] holds the value returned by the call under test.
        assert test_results[0].return_value[0][2] == [0, 1, 2, 3, 4, 5]
        assert (
            test_results[0].stdout
            == "codeflash stdout : BubbleSorter.sorter_classmethod() called\n"
        )
        # Sanity check: a result set always compares equal to itself.
        match, _ = compare_test_results(test_results, test_results)
        assert match

        assert (
            test_results[1].id.function_getting_tested == "sorter_classmethod"
        )
        assert test_results[1].id.test_class_name is None
        assert test_results[1].id.test_function_name == "test_sort"
        assert (
            test_results[1].id.test_module_path
            == "code_to_optimize.tests.pytest.test_classmethod_behavior_results_temp"
        )
        assert test_results[1].runtime > 0
        assert test_results[1].did_pass
        assert (
            test_results[1].stdout
            == "codeflash stdout : BubbleSorter.sorter_classmethod() called\n"
        )

        # A second run of the same instrumented test must compare as equal.
        results2 = _run_and_parse(test_files, test_env, test_config)

        match, _ = compare_test_results(test_results, results2)
        assert match

    finally:
        # Undo every on-disk change: restore the source and drop temp files.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)


def test_staticmethod_full_instrumentation() -> None:
    """Instrument a test of ``BubbleSorter.sorter_staticmethod`` and check results.

    The test file is instrumented, the sync decorator and codeflash
    capture are applied to ``code_to_optimize/bubble_sort_method.py``, and
    the test is run.  Exactly two results are expected, one per call to
    ``sorter_staticmethod``.  The run is then repeated and the two result
    sets must match.  All files touched are restored or removed in
    ``finally``.
    """
    code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_sort():
    input = [5, 4, 3, 2, 1, 0]
    output = BubbleSorter.sorter_staticmethod(input)
    assert output == [0, 1, 2, 3, 4, 5]

    input = [5.0, 4.0, 3.0, 2.0, 1.0, 0.0]
    output = BubbleSorter.sorter_staticmethod(input)
    assert output == [0.0, 1.0, 2.0, 3.0, 4.0, 5.0]"""

    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    # Keep the pristine source so it can be restored after the test.
    original_code = fto_path.read_text("utf-8")
    fto = FunctionToOptimize(
        function_name="sorter_staticmethod",
        parents=(FunctionParent(name="BubbleSorter", type="ClassDef"),),
        file_path=Path(fto_path),
    )
    tests_root = (project_root / "code_to_optimize/tests/pytest/").resolve()
    test_path = tests_root / "test_staticmethod_behavior_results_temp.py"
    test_path_perf = (
        tests_root / "test_staticmethod_behavior_results_perf_temp.py"
    )
    project_root_path = project_root

    try:
        # Write and instrument the test file
        test_path.write_text(code, encoding="utf-8")
        original_cwd = Path.cwd()
        os.chdir(project_root_path)
        try:
            # The positions point at the two ``sorter_staticmethod`` call
            # lines in the test source above (lines 6 and 10, column 13).
            success, new_test = inject_profiling_into_existing_test(
                test_path,
                [CodePosition(6, 13), CodePosition(10, 13)],
                fto,
                project_root_path,
            )
        finally:
            # Always restore the working directory, even if instrumentation
            # raises, so later tests are not run from the wrong directory.
            os.chdir(original_cwd)
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            fto,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        # Add codeflash capture
        instrument_codeflash_capture(fto, {}, tests_root)

        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )
        test_results = _run_and_parse(test_files, test_env, test_config)
        assert len(test_results) == 2
        assert (
            test_results[0].id.function_getting_tested == "sorter_staticmethod"
        )
        assert test_results[0].id.test_class_name is None
        assert test_results[0].id.test_function_name == "test_sort"
        assert (
            test_results[0].id.test_module_path
            == "code_to_optimize.tests.pytest.test_staticmethod_behavior_results_temp"
        )
        assert test_results[0].runtime > 0
        assert test_results[0].did_pass
        # return_value[0][2] holds the value returned by the call under test.
        assert test_results[0].return_value[0][2] == [0, 1, 2, 3, 4, 5]
        assert (
            test_results[0].stdout
            == "codeflash stdout : BubbleSorter.sorter_staticmethod() called\n"
        )
        # Sanity check: a result set always compares equal to itself.
        match, _ = compare_test_results(test_results, test_results)
        assert match

        assert (
            test_results[1].id.function_getting_tested == "sorter_staticmethod"
        )
        assert test_results[1].id.test_class_name is None
        assert test_results[1].id.test_function_name == "test_sort"
        assert (
            test_results[1].id.test_module_path
            == "code_to_optimize.tests.pytest.test_staticmethod_behavior_results_temp"
        )
        assert test_results[1].runtime > 0
        assert test_results[1].did_pass
        assert (
            test_results[1].stdout
            == "codeflash stdout : BubbleSorter.sorter_staticmethod() called\n"
        )

        # A second run of the same instrumented test must compare as equal.
        results2 = _run_and_parse(test_files, test_env, test_config)

        match, _ = compare_test_results(test_results, results2)
        assert match

    finally:
        # Undo every on-disk change: restore the source and drop temp files.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)