# codeflash-agent/packages/codeflash-python/tests/test_instrumentation_run_results_aiservice.py

from __future__ import annotations

import importlib
import os
import sys
from pathlib import Path

from codeflash_python._model import (
    FunctionParent,
    FunctionToOptimize,
    TestingMode,
    VerificationType,
)
from codeflash_python.test_discovery.models import CodePosition, TestType
from codeflash_python.testing._instrument_async import write_async_helper_file
from codeflash_python.testing._instrument_capture import (
    instrument_codeflash_capture,
)
from codeflash_python.testing._instrument_sync import (
    add_sync_decorator_to_function,
)
from codeflash_python.testing._instrumentation import (
    inject_profiling_into_existing_test,
)
from codeflash_python.testing._parse_results import parse_test_results
from codeflash_python.testing._test_runner import run_behavioral_tests
from codeflash_python.testing.models import TestConfig, TestFile, TestFiles
from codeflash_python.verification._verification import compare_test_results

# Absolute directory containing this test module; every fixture path used by
# the tests below is built relative to it.
project_root = Path(__file__).parent.resolve()


def test_class_method_test_instrumentation_only() -> None:
    """Verifies instrumented test execution and result parsing without codeflash capture.

    Steps:
      1. Write a raw pytest file and instrument it in BEHAVIOR mode.
      2. Add the sync decorator to ``BubbleSorter.sorter`` in the source file.
      3. Run the behavioral tests and check the parsed result fields.
      4. Overwrite the source with a variant whose ``__init__`` defaults
         ``x`` to 1, re-instrument, re-run, and assert that comparing both
         result sets reports a mismatch.

    The source file is restored, the temporary files are removed and the
    working directory is reset in the ``finally`` block.
    """
    raw_test_code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_single_element_list():
    obj = BubbleSorter()
    result = obj.sorter([42])
"""

    # Init paths
    test_path = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_temp.py"
    ).resolve()
    test_path_perf = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_perf_temp.py"
    ).resolve()
    tests_root = project_root / "code_to_optimize/tests/pytest/"
    project_root_path = project_root
    run_cwd = project_root
    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    # fto_path is absolute, so reading it does not depend on the current
    # working directory.
    original_code = fto_path.read_text("utf-8")

    function_to_optimize = FunctionToOptimize(
        "sorter",
        fto_path,
        parents=(FunctionParent("BubbleSorter", "ClassDef"),),
    )

    # Record the cwd immediately before the try block and change directory
    # inside it, so the finally clause always restores the cwd.
    old_cwd = os.getcwd()
    try:
        os.chdir(run_cwd)

        # Write raw test, instrument it, then add decorator to source
        test_path.write_text(raw_test_code, encoding="utf-8")

        # CodePosition(6, 13) targets line 6 of the raw test, the line that
        # contains the ``obj.sorter(...)`` call.
        success, new_test = inject_profiling_into_existing_test(
            test_path,
            [CodePosition(6, 13)],
            function_to_optimize,
            project_root_path,
            mode=TestingMode.BEHAVIOR,
        )
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )

        # Baseline run against the original (decorated) source.
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        assert test_results[0].id.function_getting_tested == "sorter"
        assert (
            test_results[0].stdout
            == "codeflash stdout : BubbleSorter.sorter() called\n"
        )
        assert (
            test_results[0].id.test_function_name == "test_single_element_list"
        )
        assert test_results[0].did_pass
        # return_value is ((args, kwargs, return_value),) in the new path
        assert test_results[0].return_value[0][2] == [42]

        # Replace the source with a candidate whose __init__ default for
        # ``x`` is 1 and whose printed message lacks the
        # "codeflash stdout : " prefix.
        optimized_code_mutated_attr = """
import sys


class BubbleSorter:

    def __init__(self, x=1):
        self.x = x

    def sorter(self, arr):
        print("BubbleSorter.sorter() called")
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        print("stderr test", file=sys.stderr)
        return arr
                        """
        fto_path.write_text(optimized_code_mutated_attr, "utf-8")

        # Re-add sync decorator to the new source
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results_mutated_attr = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        # In the new decorator-based path, args (including self) are captured,
        # so init state changes ARE detected even without explicit codeflash_capture
        match, _ = compare_test_results(
            test_results, test_results_mutated_attr
        )
        assert not match
        assert (
            test_results_mutated_attr[0].stdout
            == "BubbleSorter.sorter() called\n"
        )
    finally:
        # Undo every on-disk change made above and restore the cwd, whether
        # or not the assertions passed.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)
        os.chdir(old_cwd)


def test_class_method_full_instrumentation() -> None:
    """Verifies full instrumentation with codeflash capture for instance state verification.

    Steps:
      1. Write a raw pytest file and instrument it in BEHAVIOR mode.
      2. Add the sync decorator to ``BubbleSorter.sorter`` and the codeflash
         capture instrumentation, then run the behavioral tests. The parsed
         results are expected to contain an entry for
         ``BubbleSorter.__init__`` followed by one for ``sorter``.
      3. Replace the source with a variant whose ``__init__`` defaults ``x``
         to 1, re-instrument, re-run, and assert the comparison with the
         baseline reports a mismatch.
      4. Replace the source with a variant that keeps ``x=0`` but adds a new
         attribute ``y``, re-instrument, re-run, and assert the comparison
         with the baseline also reports a mismatch.

    The source file is restored and temporary files are removed in the
    ``finally`` block.
    """
    raw_test_code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_single_element_list():
    obj = BubbleSorter()
    result = obj.sorter([3, 2, 1])
"""

    # Init paths
    test_path = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_temp.py"
    ).resolve()
    test_path_perf = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_perf_temp.py"
    ).resolve()
    tests_root = project_root / "code_to_optimize/tests/pytest/"
    project_root_path = project_root

    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    original_code = fto_path.read_text("utf-8")
    function_to_optimize = FunctionToOptimize(
        "sorter",
        fto_path,
        parents=(FunctionParent("BubbleSorter", "ClassDef"),),
    )

    try:
        # Write raw test, instrument it, then add decorator to source
        test_path.write_text(raw_test_code, encoding="utf-8")

        # Instrument from the project root; the inner try/finally restores
        # the previous cwd even when instrumentation raises.
        original_cwd = Path.cwd()
        os.chdir(project_root_path)
        try:
            # CodePosition(6, 13) targets line 6 of the raw test, the line
            # that contains the ``obj.sorter(...)`` call.
            success, new_test = inject_profiling_into_existing_test(
                test_path,
                [CodePosition(6, 13)],
                function_to_optimize,
                project_root_path,
                mode=TestingMode.BEHAVIOR,
            )
        finally:
            os.chdir(original_cwd)
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        # Add codeflash capture decorator for __init__ state tracking
        instrument_codeflash_capture(function_to_optimize, {}, tests_root)

        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )

        # Baseline run against the original (instrumented) source.
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        # Verify instance_state result (from codeflash_capture)
        assert (
            test_results[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert (
            test_results[0].id.test_function_name == "test_single_element_list"
        )
        assert test_results[0].did_pass
        assert test_results[0].return_value[0] == {"x": 0}
        assert test_results[0].stdout == ""

        # Verify function_to_optimize result (from sync decorator)
        assert test_results[1].id.function_getting_tested == "sorter"
        assert (
            test_results[1].id.test_function_name == "test_single_element_list"
        )
        assert test_results[1].did_pass
        # return_value is ((args, kwargs, return_value),) in the new path
        assert test_results[1].return_value[0][2] == [1, 2, 3]
        assert (
            test_results[1].stdout
            == "codeflash stdout : BubbleSorter.sorter() called\n"
        )

        # Replace the source with a candidate whose __init__ default for
        # ``x`` is 1 instead of the baseline's observed state {"x": 0}.
        optimized_code_mutated_attr = """
import sys


class BubbleSorter:

    def __init__(self, x=1):
        self.x = x

    def sorter(self, arr):
        print("BubbleSorter.sorter() called")
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        print("stderr test", file=sys.stderr)
        return arr
                        """
        fto_path.write_text(optimized_code_mutated_attr, "utf-8")
        # Force reload of module
        module_name = "code_to_optimize.bubble_sort_method"
        if module_name not in sys.modules:
            importlib.import_module(module_name)
        importlib.reload(sys.modules[module_name])

        # Re-add sync decorator and codeflash capture to the new source
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        instrument_codeflash_capture(function_to_optimize, {}, tests_root)
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results_mutated_attr = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        assert (
            test_results_mutated_attr[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert test_results_mutated_attr[0].return_value[0] == {"x": 1}
        assert (
            test_results_mutated_attr[0].verification_type
            == VerificationType.INIT_STATE_FTO
        )
        assert test_results_mutated_attr[0].stdout == ""
        # The comparison should report a mismatch because the instance
        # attribute value differs from the baseline.
        match, _ = compare_test_results(
            test_results, test_results_mutated_attr
        )
        assert not match

        # Replace with optimized code that did not mutate existing
        # instance attribute, but added a new one
        optimized_code_new_attr = """
import sys


class BubbleSorter:
    def __init__(self, x=0):
        self.x = x
        self.y = 2

    def sorter(self, arr):
        print("BubbleSorter.sorter() called")
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        print("stderr test", file=sys.stderr)
        return arr
                        """
        fto_path.write_text(optimized_code_new_attr, "utf-8")
        importlib.reload(sys.modules[module_name])

        # Re-add sync decorator and codeflash capture
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        instrument_codeflash_capture(function_to_optimize, {}, tests_root)
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results_new_attr = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        assert (
            test_results_new_attr[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert test_results_new_attr[0].return_value[0] == {"x": 0, "y": 2}
        assert (
            test_results_new_attr[0].verification_type
            == VerificationType.INIT_STATE_FTO
        )
        assert test_results_new_attr[0].stdout == ""
        # In the new decorator-based path, args (including self) are captured.
        # Adding a new instance attribute changes self, so the comparison
        # detects a difference even though codeflash_capture considers it additive.
        match, _ = compare_test_results(test_results, test_results_new_attr)
        assert not match
    finally:
        # Undo every on-disk change made above, whether or not the
        # assertions passed.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)