codeflash-agent/packages/codeflash-python/tests/test_instrumentation_run_results_aiservice.py
Kevin Turcios 919a673be2
Fix pre-existing CI lint and test failures (#40)
* chore: add gitignore entries for local eval repos, e2e fixtures, and env files

* fix: restore clean bubble_sort_method.py test fixture

The call-site ID commit re-contaminated this file with instrumentation
decorators, causing tests to fail with missing CODEFLASH_LOOP_INDEX.

* fix: resolve ruff and mypy errors in codeflash-python

- Add import-not-found ignores for optional torch/jax imports
- Extract magic column index to _STDOUT_COLUMN_INDEX constant
- Fix unused variable in _instrument_sync.py
- Cast cpu_time_ns to int for mypy arg-type

* fix: add skip markers for optional deps and apply ruff formatting to tests

Skip torch/jax/tensorflow tests when those packages are not installed.
Move has_module helper to conftest.py for reuse across test files.
Apply ruff format to all test files that drifted.

* fix: resolve remaining ruff format and mypy errors

- Add missing blank line in conftest.py (ruff format)
- Remove unused import-untyped ignore on jax import (mypy unused-ignore)
- Add type: ignore comments for object-typed SQLite row values

* chore: bump codeflash-python to 0.1.1.dev0
2026-04-28 18:39:46 -05:00

451 lines
15 KiB
Python

from __future__ import annotations
import importlib
import os
import sys
from pathlib import Path
from codeflash_python._model import (
FunctionParent,
FunctionToOptimize,
TestingMode,
VerificationType,
)
from codeflash_python.test_discovery.models import CodePosition, TestType
from codeflash_python.testing._instrument_async import write_async_helper_file
from codeflash_python.testing._instrument_capture import (
instrument_codeflash_capture,
)
from codeflash_python.testing._instrument_sync import (
add_sync_decorator_to_function,
)
from codeflash_python.testing._instrumentation import (
inject_profiling_into_existing_test,
)
from codeflash_python.testing._parse_results import parse_test_results
from codeflash_python.testing._test_runner import run_behavioral_tests
from codeflash_python.testing.models import TestConfig, TestFile, TestFiles
from codeflash_python.verification._verification import compare_test_results
# Directory containing this test module; the code_to_optimize fixture paths
# used by the tests in this file are built relative to it.
project_root = Path(__file__).parent.resolve()
def test_class_method_test_instrumentation_only() -> None:
    """Verifies instrumented test execution and result parsing without codeflash capture.

    Steps:
      1. Write a small pytest file that calls ``BubbleSorter.sorter([42])``
         and instrument it in ``TestingMode.BEHAVIOR``.
      2. Add the sync decorator to the fixture source and run the behavioral
         tests once to get baseline results.
      3. Overwrite the fixture source with a variant whose ``__init__``
         defaults ``x`` to 1, re-decorate it, and run again.
      4. Assert that comparing the two result sets reports a mismatch.

    The fixture source, both temporary test files, the generated async helper
    file and the working directory are restored in ``finally``.
    """
    # The layout of this snippet is significant: CodePosition(6, 13) below
    # appears to address the ``obj.sorter(...)`` call (line 6, column offset
    # 13), which requires the two blank lines and the 4-space indent.
    raw_test_code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_single_element_list():
    obj = BubbleSorter()
    result = obj.sorter([42])
"""
    # Init paths
    test_path = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_temp.py"
    ).resolve()
    test_path_perf = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_perf_temp.py"
    ).resolve()
    tests_root = project_root / "code_to_optimize/tests/pytest/"
    project_root_path = project_root
    run_cwd = project_root
    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    # Snapshot the fixture source so ``finally`` can put it back.
    original_code = fto_path.read_text("utf-8")
    function_to_optimize = FunctionToOptimize(
        "sorter",
        fto_path,
        parents=(FunctionParent("BubbleSorter", "ClassDef"),),
    )
    # Change directory immediately before the ``try`` so that nothing can
    # raise between the chdir and the ``finally`` block that undoes it.
    old_cwd = os.getcwd()
    os.chdir(run_cwd)
    try:
        # Write raw test, instrument it, then add decorator to source
        test_path.write_text(raw_test_code, encoding="utf-8")
        success, new_test = inject_profiling_into_existing_test(
            test_path,
            [CodePosition(6, 13)],
            function_to_optimize,
            project_root_path,
            mode=TestingMode.BEHAVIOR,
        )
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )

        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )

        # Baseline run against the fixture source.
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        assert test_results[0].id.function_getting_tested == "sorter"
        assert (
            test_results[0].stdout
            == "codeflash stdout : BubbleSorter.sorter() called\n"
        )
        assert (
            test_results[0].id.test_function_name == "test_single_element_list"
        )
        assert test_results[0].did_pass
        # return_value is ((args, kwargs, return_value),) in the new path
        assert test_results[0].return_value[0][2] == [42]

        # Replace with optimized code that mutated instance attribute
        optimized_code_mutated_attr = """
import sys


class BubbleSorter:
    def __init__(self, x=1):
        self.x = x

    def sorter(self, arr):
        print("BubbleSorter.sorter() called")
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        print("stderr test", file=sys.stderr)
        return arr
"""
        fto_path.write_text(optimized_code_mutated_attr, "utf-8")
        # Re-add sync decorator to the new source
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results_mutated_attr = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        # In the new decorator-based path, args (including self) are captured,
        # so init state changes ARE detected even without explicit codeflash_capture
        match, _ = compare_test_results(
            test_results, test_results_mutated_attr
        )
        assert not match
        assert (
            test_results_mutated_attr[0].stdout
            == "BubbleSorter.sorter() called\n"
        )
    finally:
        # Undo every on-disk side effect, then restore the working directory.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)
        os.chdir(old_cwd)
def test_class_method_full_instrumentation() -> None:
    """Verifies full instrumentation with codeflash capture for instance state verification.

    Steps:
      1. Write a small pytest file that calls ``BubbleSorter.sorter([3, 2, 1])``
         and instrument it in ``TestingMode.BEHAVIOR``.
      2. Add the sync decorator and the codeflash capture instrumentation to
         the fixture source, then run the behavioral tests for a baseline.
         The baseline is expected to contain a ``BubbleSorter.__init__``
         result followed by a ``sorter`` result.
      3. Replace the source with a variant whose ``__init__`` defaults ``x``
         to 1, re-instrument, run, and assert the comparison reports a
         mismatch.
      4. Replace the source with a variant that keeps ``x=0`` but adds
         ``self.y = 2``, re-instrument, run, and assert the comparison again
         reports a mismatch.

    The fixture source, both temporary test files and the generated async
    helper file are restored in ``finally``.
    """
    # The layout of this snippet is significant: CodePosition(6, 13) below
    # appears to address the ``obj.sorter(...)`` call (line 6, column offset
    # 13), which requires the two blank lines and the 4-space indent.
    raw_test_code = """from code_to_optimize.bubble_sort_method import BubbleSorter


def test_single_element_list():
    obj = BubbleSorter()
    result = obj.sorter([3, 2, 1])
"""
    # Init paths
    test_path = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_temp.py"
    ).resolve()
    test_path_perf = (
        project_root
        / "code_to_optimize/tests/pytest/test_aiservice_behavior_results_perf_temp.py"
    ).resolve()
    tests_root = project_root / "code_to_optimize/tests/pytest/"
    project_root_path = project_root
    fto_path = (
        project_root / "code_to_optimize/bubble_sort_method.py"
    ).resolve()
    # Snapshot the fixture source so ``finally`` can put it back.
    original_code = fto_path.read_text("utf-8")
    function_to_optimize = FunctionToOptimize(
        "sorter",
        fto_path,
        parents=(FunctionParent("BubbleSorter", "ClassDef"),),
    )
    try:
        # Write raw test, instrument it, then add decorator to source
        test_path.write_text(raw_test_code, encoding="utf-8")
        original_cwd = Path.cwd()
        os.chdir(project_root_path)
        try:
            success, new_test = inject_profiling_into_existing_test(
                test_path,
                [CodePosition(6, 13)],
                function_to_optimize,
                project_root_path,
                mode=TestingMode.BEHAVIOR,
            )
        finally:
            # Restore the working directory even when instrumentation raises,
            # so a failure here cannot leak a changed cwd into other tests.
            os.chdir(original_cwd)
        assert success
        assert new_test is not None
        test_path.write_text(new_test, encoding="utf-8")

        # Write the async helper file and add sync decorator to source
        write_async_helper_file(project_root_path)
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        # Add codeflash capture decorator for __init__ state tracking
        instrument_codeflash_capture(function_to_optimize, {}, tests_root)

        test_config = TestConfig(
            tests_root=tests_root,
            tests_project_rootdir=project_root_path,
            project_root_path=project_root_path,
            test_framework="pytest",
            pytest_cmd="pytest",
        )
        test_env = os.environ.copy()
        test_env["CODEFLASH_TEST_ITERATION"] = "0"
        test_env["CODEFLASH_LOOP_INDEX"] = "1"
        test_type = TestType.EXISTING_UNIT_TEST
        test_files = TestFiles(
            test_files=[
                TestFile(
                    instrumented_behavior_file_path=test_path,
                    test_type=test_type,
                    original_file_path=test_path,
                    benchmarking_file_path=test_path_perf,
                )
            ]
        )

        # Baseline run against the fixture source.
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        # Verify instance_state result (from codeflash_capture)
        assert (
            test_results[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert (
            test_results[0].id.test_function_name == "test_single_element_list"
        )
        assert test_results[0].did_pass
        assert test_results[0].return_value[0] == {"x": 0}
        assert test_results[0].stdout == ""
        # Verify function_to_optimize result (from sync decorator)
        assert test_results[1].id.function_getting_tested == "sorter"
        assert (
            test_results[1].id.test_function_name == "test_single_element_list"
        )
        assert test_results[1].did_pass
        # return_value is ((args, kwargs, return_value),) in the new path
        assert test_results[1].return_value[0][2] == [1, 2, 3]
        assert (
            test_results[1].stdout
            == "codeflash stdout : BubbleSorter.sorter() called\n"
        )

        # Replace with optimized code that mutated instance attribute
        optimized_code_mutated_attr = """
import sys


class BubbleSorter:
    def __init__(self, x=1):
        self.x = x

    def sorter(self, arr):
        print("BubbleSorter.sorter() called")
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        print("stderr test", file=sys.stderr)
        return arr
"""
        fto_path.write_text(optimized_code_mutated_attr, "utf-8")
        # Force reload of module
        module_name = "code_to_optimize.bubble_sort_method"
        if module_name not in sys.modules:
            importlib.import_module(module_name)
        importlib.reload(sys.modules[module_name])
        # Re-add sync decorator and codeflash capture to the new source
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        instrument_codeflash_capture(function_to_optimize, {}, tests_root)
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results_mutated_attr = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        assert (
            test_results_mutated_attr[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert test_results_mutated_attr[0].return_value[0] == {"x": 1}
        assert (
            test_results_mutated_attr[0].verification_type
            == VerificationType.INIT_STATE_FTO
        )
        assert test_results_mutated_attr[0].stdout == ""
        # The test should fail because the instance attribute was mutated
        match, _ = compare_test_results(
            test_results, test_results_mutated_attr
        )
        assert not match

        # Replace with optimized code that did not mutate existing
        # instance attribute, but added a new one
        optimized_code_new_attr = """
import sys


class BubbleSorter:
    def __init__(self, x=0):
        self.x = x
        self.y = 2

    def sorter(self, arr):
        print("BubbleSorter.sorter() called")
        for i in range(len(arr)):
            for j in range(len(arr) - 1):
                if arr[j] > arr[j + 1]:
                    temp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = temp
        print("stderr test", file=sys.stderr)
        return arr
"""
        fto_path.write_text(optimized_code_new_attr, "utf-8")
        importlib.reload(sys.modules[module_name])
        # Re-add sync decorator and codeflash capture
        add_sync_decorator_to_function(
            fto_path,
            function_to_optimize,
            mode=TestingMode.BEHAVIOR,
            project_root=project_root_path,
        )
        instrument_codeflash_capture(function_to_optimize, {}, tests_root)
        xml_path, run_result, _, _ = run_behavioral_tests(
            test_files=test_files,
            test_env=test_env,
            cwd=test_config.project_root_path,
            pytest_cmd=test_config.pytest_cmd,
        )
        test_results_new_attr = parse_test_results(
            test_xml_path=xml_path,
            test_files=test_files,
            test_config=test_config,
            optimization_iteration=0,
            run_result=run_result,
        )
        assert (
            test_results_new_attr[0].id.function_getting_tested
            == "BubbleSorter.__init__"
        )
        assert test_results_new_attr[0].return_value[0] == {"x": 0, "y": 2}
        assert (
            test_results_new_attr[0].verification_type
            == VerificationType.INIT_STATE_FTO
        )
        assert test_results_new_attr[0].stdout == ""
        # In the new decorator-based path, args (including self) are captured.
        # Adding a new instance attribute changes self, so the comparison
        # detects a difference even though codeflash_capture considers it additive.
        match, _ = compare_test_results(test_results, test_results_new_attr)
        assert not match
    finally:
        # Undo every on-disk side effect of the test.
        fto_path.write_text(original_code, "utf-8")
        test_path.unlink(missing_ok=True)
        test_path_perf.unlink(missing_ok=True)
        (project_root / "codeflash_async_wrapper.py").unlink(missing_ok=True)