# codeflash/tests/test_critic.py
import os
from pathlib import Path
from unittest.mock import Mock
from codeflash.code_utils.env_utils import get_pr_number
from codeflash.models.models import (
CodeOptimizationContext,
ConcurrencyMetrics,
CoverageData,
CoverageStatus,
FunctionCoverage,
FunctionTestInvocation,
InvocationId,
OptimizedCandidateResult,
TestResults,
TestType,
)
from codeflash.result.critic import (
concurrency_gain,
coverage_critic,
performance_gain,
quantity_of_tests_critic,
speedup_critic,
throughput_gain,
)
from codeflash.verification.parse_test_output import parse_concurrency_metrics
def test_performance_gain() -> None:
assert performance_gain(original_runtime_ns=1000, optimized_runtime_ns=0) == 0.0
assert performance_gain(original_runtime_ns=1000, optimized_runtime_ns=500) == 1.0
assert performance_gain(original_runtime_ns=1000, optimized_runtime_ns=900) == 0.1111111111111111
assert performance_gain(original_runtime_ns=1000, optimized_runtime_ns=1000) == 0.0
assert performance_gain(original_runtime_ns=1000, optimized_runtime_ns=1100) == -0.09090909090909091
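

# A minimal reference sketch of the gain formula, reconstructed purely from the
# assertions above; the real definition lives in codeflash.result.critic and may
# differ in detail. The gain is (original - optimized) / optimized, with a 0.0
# guard for a zero optimized runtime.
def _reference_performance_gain(original_runtime_ns: int, optimized_runtime_ns: int) -> float:
    """Illustrative sketch only, not the production implementation."""
    if optimized_runtime_ns == 0:
        return 0.0
    return (original_runtime_ns - optimized_runtime_ns) / optimized_runtime_ns

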
def test_speedup_critic() -> None:
original_code_runtime = 1000
best_runtime_until_now = 1000
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=800,
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=12,
)
assert speedup_critic(candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True) # 20% improvement
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=940,
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
    assert not speedup_critic(candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True)  # the 6% improvement is rejected at this small runtime
original_code_runtime = 100000
best_runtime_until_now = 100000
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=94000,
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
    assert speedup_critic(candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True)  # the same 6% improvement is accepted at this larger runtime
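
    # Note: taken together, the three cases above suggest that speedup_critic
    # applies a noise floor which shrinks as the measured runtime grows; the same
    # 6% gain is rejected at roughly 1 microsecond of runtime but accepted at
    # roughly 100 microseconds. This is an inference from the assertions above,
    # not a statement about the implementation.

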
def test_generated_test_critic() -> None:
test_1 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_1",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_1"),
did_pass=True,
runtime=0,
test_framework="pytest",
test_type=TestType.GENERATED_REGRESSION,
return_value=None,
timed_out=False,
loop_index=1,
)
test_2 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_2",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_2"),
did_pass=True,
runtime=0,
test_framework="pytest",
test_type=TestType.GENERATED_REGRESSION,
return_value=None,
timed_out=False,
loop_index=1,
)
test_3 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_3",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_3"),
did_pass=True,
runtime=0,
test_framework="pytest",
test_type=TestType.EXISTING_UNIT_TEST,
return_value=None,
timed_out=False,
loop_index=1,
)
test_4 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_4",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_4"),
did_pass=False,
runtime=0,
test_framework="pytest",
test_type=TestType.GENERATED_REGRESSION,
return_value=None,
timed_out=False,
loop_index=1,
)
test_5 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_5",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_5"),
did_pass=True,
runtime=0,
test_framework="pytest",
test_type=TestType.REPLAY_TEST,
return_value=None,
timed_out=False,
loop_index=1,
)
test_6 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_6",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_6"),
did_pass=True,
runtime=0,
test_framework="pytest",
test_type=TestType.GENERATED_REGRESSION,
return_value=None,
timed_out=False,
loop_index=2,
)
test_7 = FunctionTestInvocation(
id=InvocationId(
test_module_path="",
test_class_name="",
test_function_name="test_7",
function_getting_tested="sorter",
iteration_id="",
),
file_name=Path("test_7"),
did_pass=True,
runtime=0,
test_framework="pytest",
test_type=TestType.EXISTING_UNIT_TEST,
return_value=None,
timed_out=False,
loop_index=1,
)
test_results = [test_1, test_2, test_3, test_4, test_5, test_6, test_7, test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_2, test_3, test_6, test_7, test_1, test_4, test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_3, test_4, test_2, test_7, test_1, test_6, test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
test_results = [test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert not quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_2, test_3, test_4, test_5, test_1, test_1, test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_4, test_6]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert not quantity_of_tests_critic(candidate_result)
test_results = [test_4, test_5]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_2, test_3, test_4, test_5, test_1, test_1, test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
get_pr_number.cache_clear()
os.environ["CODEFLASH_PR_NUMBER"] = "1234"
test_results = [test_1, test_2, test_3, test_6]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert not quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_2, test_3, test_4]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert not quantity_of_tests_critic(candidate_result)
test_results = [test_1, test_2, test_3, test_5, test_1, test_1, test_1, test_1]
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=100,
behavior_test_results=TestResults(test_results=test_results),
benchmarking_test_results=TestResults(),
total_candidate_timing=12,
optimization_candidate_index=0,
)
assert quantity_of_tests_critic(candidate_result)
del os.environ["CODEFLASH_PR_NUMBER"]
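
    # A hedged summary of what the cases above imply about quantity_of_tests_critic:
    # a single passing generated test is not enough, one passing REPLAY_TEST can
    # carry a candidate on its own, and once CODEFLASH_PR_NUMBER is set the critic
    # appears to require replay-test evidence as well. Inferred from the assertions,
    # not from the critic's documented contract.

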
def test_coverage_critic() -> None:
mock_code_context = Mock(spec=CodeOptimizationContext)
passing_coverage = CoverageData(
file_path=Path("test_file.py"),
coverage=100.0,
function_name="test_function",
functions_being_tested=["function1", "function2"],
graph={},
code_context=mock_code_context,
main_func_coverage=FunctionCoverage(
name="test_function",
coverage=100.0,
executed_lines=[10],
unexecuted_lines=[2],
executed_branches=[[5]],
unexecuted_branches=[[1]],
),
dependent_func_coverage=None,
status=CoverageStatus.PARSED_SUCCESSFULLY,
)
assert coverage_critic(passing_coverage) is True
border_coverage = CoverageData(
file_path=Path("test_file.py"),
coverage=60.0,
function_name="test_function",
functions_being_tested=["function1", "function2"],
graph={},
code_context=mock_code_context,
main_func_coverage=FunctionCoverage(
name="test_function",
coverage=50.0,
executed_lines=[10],
unexecuted_lines=[2],
executed_branches=[[5]],
unexecuted_branches=[[1]],
),
dependent_func_coverage=None,
status=CoverageStatus.PARSED_SUCCESSFULLY,
)
assert coverage_critic(border_coverage) is True
failing_coverage = CoverageData(
file_path=Path("test_file.py"),
coverage=30.0,
function_name="test_function",
functions_being_tested=["function1", "function2"],
graph={},
code_context=mock_code_context,
main_func_coverage=FunctionCoverage(
name="test_function",
coverage=0.0,
executed_lines=[],
unexecuted_lines=[10],
executed_branches=[],
unexecuted_branches=[[5]],
),
dependent_func_coverage=None,
status=CoverageStatus.PARSED_SUCCESSFULLY,
)
assert coverage_critic(failing_coverage) is False
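
    # The three fixtures above bracket the acceptance threshold: 100% coverage
    # passes, 60% (named border_coverage) still passes, and 30% fails, so the
    # cutoff sits at or below 60%. That bound is inferred from the fixtures, not
    # read from coverage_critic itself.

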
def test_throughput_gain() -> None:
"""Test throughput_gain calculation."""
# Test basic throughput improvement
assert throughput_gain(original_throughput=100, optimized_throughput=150) == 0.5 # 50% improvement
# Test no improvement
assert throughput_gain(original_throughput=100, optimized_throughput=100) == 0.0
# Test regression
assert throughput_gain(original_throughput=100, optimized_throughput=80) == -0.2 # 20% regression
# Test zero original throughput (edge case)
assert throughput_gain(original_throughput=0, optimized_throughput=50) == 0.0
# Test large improvement
assert throughput_gain(original_throughput=50, optimized_throughput=200) == 3.0 # 300% improvement
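

# A minimal reference sketch matching the assertions above; the real definition
# lives in codeflash.result.critic. The gain is (optimized - original) / original,
# with a 0.0 guard for zero original throughput.
def _reference_throughput_gain(original_throughput: float, optimized_throughput: float) -> float:
    """Illustrative sketch only, not the production implementation."""
    if original_throughput == 0:
        return 0.0
    return (optimized_throughput - original_throughput) / original_throughput

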
def test_speedup_critic_with_async_throughput() -> None:
"""Test speedup_critic with async throughput evaluation."""
original_code_runtime = 10000 # 10 microseconds
original_async_throughput = 100
# Test case 1: Both runtime and throughput improve significantly
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=8000, # 20% runtime improvement
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=8000,
async_throughput=120, # 20% throughput improvement
)
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
disable_gh_action_noise=True
)
# Test case 2: Runtime improves significantly, throughput doesn't meet threshold (should pass)
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=8000, # 20% runtime improvement
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=8000,
async_throughput=105, # Only 5% throughput improvement (below 10% threshold)
)
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
disable_gh_action_noise=True
)
# Test case 3: Throughput improves significantly, runtime doesn't meet threshold (should pass)
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=9800, # Only 2% runtime improvement (below 5% threshold)
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=9800,
async_throughput=120, # 20% throughput improvement
)
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
disable_gh_action_noise=True
)
# Test case 4: No throughput data - should fall back to runtime-only evaluation
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=8000, # 20% runtime improvement
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=8000,
async_throughput=None, # No throughput data
)
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=None, # No original throughput data
best_throughput_until_now=None,
disable_gh_action_noise=True
)
# Test case 5: Test best_throughput_until_now comparison
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=8000, # 20% runtime improvement
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=8000,
async_throughput=115, # 15% throughput improvement
)
# Should pass when no best throughput yet
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
disable_gh_action_noise=True
)
# Should fail when there's a better throughput already
assert not speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=7000, # Better runtime already exists
original_async_throughput=original_async_throughput,
best_throughput_until_now=120, # Better throughput already exists
disable_gh_action_noise=True
)
# Test case 6: Zero original throughput (edge case)
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=8000, # 20% runtime improvement
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=8000,
async_throughput=50,
)
# Should pass when original throughput is 0 (throughput evaluation skipped)
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=0, # Zero original throughput
best_throughput_until_now=None,
disable_gh_action_noise=True
)
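
    # Reading the cases above together: runtime and throughput act as independent,
    # OR-ed criteria (roughly a 5% runtime bar and a 10% throughput bar, per the
    # inline comments), and when throughput data is missing or the original
    # throughput is zero, the critic falls back to runtime alone. Inferred from
    # the assertions, not from speedup_critic's source.

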
def test_concurrency_gain() -> None:
"""Test concurrency_gain calculation."""
# Test basic concurrency improvement (blocking -> non-blocking)
original = ConcurrencyMetrics(
sequential_time_ns=10_000_000, # 10ms
concurrent_time_ns=10_000_000, # 10ms (no speedup - blocking)
concurrency_factor=10,
concurrency_ratio=1.0, # sequential/concurrent = 1.0
)
optimized = ConcurrencyMetrics(
sequential_time_ns=10_000_000, # 10ms
concurrent_time_ns=1_000_000, # 1ms (10x speedup - non-blocking)
concurrency_factor=10,
concurrency_ratio=10.0, # sequential/concurrent = 10.0
)
# 900% improvement: (10 - 1) / 1 = 9.0
assert concurrency_gain(original, optimized) == 9.0
# Test no improvement
same = ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=10_000_000,
concurrency_factor=10,
concurrency_ratio=1.0,
)
assert concurrency_gain(original, same) == 0.0
# Test slight improvement
slightly_better = ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=8_000_000,
concurrency_factor=10,
concurrency_ratio=1.25,
)
# 25% improvement: (1.25 - 1.0) / 1.0 = 0.25
assert concurrency_gain(original, slightly_better) == 0.25
# Test zero original ratio (edge case)
zero_ratio = ConcurrencyMetrics(
sequential_time_ns=0,
concurrent_time_ns=1_000_000,
concurrency_factor=10,
concurrency_ratio=0.0,
)
assert concurrency_gain(zero_ratio, optimized) == 0.0
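

# A minimal reference sketch consistent with the assertions above; the real
# definition lives in codeflash.result.critic. The gain is the relative change
# in concurrency_ratio, with a 0.0 guard for a zero original ratio.
def _reference_concurrency_gain(original: ConcurrencyMetrics, optimized: ConcurrencyMetrics) -> float:
    """Illustrative sketch only, not the production implementation."""
    if original.concurrency_ratio == 0:
        return 0.0
    return (optimized.concurrency_ratio - original.concurrency_ratio) / original.concurrency_ratio

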
def test_speedup_critic_with_concurrency_metrics() -> None:
"""Test speedup_critic with concurrency metrics evaluation."""
original_code_runtime = 10000 # 10 microseconds
original_async_throughput = 100
# Original concurrency metrics (blocking code - ratio ~= 1.0)
original_concurrency = ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=10_000_000,
concurrency_factor=10,
concurrency_ratio=1.0,
)
# Test case 1: Concurrency improves significantly (blocking -> non-blocking)
candidate_result = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=10000, # Same runtime
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=10000,
async_throughput=100, # Same throughput
concurrency_metrics=ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=1_000_000, # 10x faster concurrent execution
concurrency_factor=10,
concurrency_ratio=10.0, # 900% improvement
),
)
# Should pass due to concurrency improvement even though runtime/throughput unchanged
assert speedup_critic(
candidate_result=candidate_result,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
original_concurrency_metrics=original_concurrency,
best_concurrency_ratio_until_now=None,
disable_gh_action_noise=True,
)
# Test case 2: No concurrency improvement (should fall back to other metrics)
candidate_result_no_conc = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=8000, # 20% runtime improvement
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=8000,
async_throughput=100,
concurrency_metrics=ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=10_000_000,
concurrency_factor=10,
concurrency_ratio=1.0, # No improvement
),
)
# Should pass due to runtime improvement
assert speedup_critic(
candidate_result=candidate_result_no_conc,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
original_concurrency_metrics=original_concurrency,
best_concurrency_ratio_until_now=None,
disable_gh_action_noise=True,
)
# Test case 3: Concurrency below threshold (20% required)
candidate_result_below_threshold = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=10000, # Same runtime
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=10000,
async_throughput=100, # Same throughput
concurrency_metrics=ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=9_000_000, # Only 11% improvement
concurrency_factor=10,
concurrency_ratio=1.11,
),
)
# Should fail - no metric improves enough
assert not speedup_critic(
candidate_result=candidate_result_below_threshold,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
original_concurrency_metrics=original_concurrency,
best_concurrency_ratio_until_now=None,
disable_gh_action_noise=True,
)
# Test case 4: best_concurrency_ratio_until_now comparison
candidate_result_good = OptimizedCandidateResult(
max_loop_count=5,
best_test_runtime=10000,
behavior_test_results=TestResults(),
benchmarking_test_results=TestResults(),
optimization_candidate_index=0,
total_candidate_timing=10000,
async_throughput=100,
concurrency_metrics=ConcurrencyMetrics(
sequential_time_ns=10_000_000,
concurrent_time_ns=2_000_000,
concurrency_factor=10,
concurrency_ratio=5.0,
),
)
# Should fail when there's a better concurrency ratio already
assert not speedup_critic(
candidate_result=candidate_result_good,
original_code_runtime=original_code_runtime,
best_runtime_until_now=None,
original_async_throughput=original_async_throughput,
best_throughput_until_now=None,
original_concurrency_metrics=original_concurrency,
best_concurrency_ratio_until_now=10.0, # Better ratio already exists
disable_gh_action_noise=True,
)
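
    # As with throughput, the cases above treat concurrency as one more OR-ed
    # criterion (the inline comment cites a 20% bar) and additionally compare
    # candidates against the best concurrency ratio seen so far. Again an
    # inference from the assertions rather than from the implementation.

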
def test_parse_concurrency_metrics() -> None:
"""Test parse_concurrency_metrics function."""
# Test with valid concurrency output
stdout = (
"!@######CONC:test_module:TestClass:test_func:my_function:0:10000000:1000000:10######@!\n"
"!@######CONC:test_module:TestClass:test_func:my_function:1:10000000:1000000:10######@!\n"
)
test_results = TestResults(perf_stdout=stdout)
metrics = parse_concurrency_metrics(test_results, "my_function")
assert metrics is not None
assert metrics.sequential_time_ns == 10_000_000 # Average of both matches
assert metrics.concurrent_time_ns == 1_000_000
assert metrics.concurrency_factor == 10
assert metrics.concurrency_ratio == 10.0 # 10000000 / 1000000
# Test with no matching function
metrics_wrong_func = parse_concurrency_metrics(test_results, "other_function")
assert metrics_wrong_func is None
# Test with empty stdout
empty_results = TestResults(perf_stdout="")
metrics_empty = parse_concurrency_metrics(empty_results, "my_function")
assert metrics_empty is None
# Test with None stdout
none_results = TestResults(perf_stdout=None)
metrics_none = parse_concurrency_metrics(none_results, "my_function")
assert metrics_none is None
# Test with no class name
stdout_no_class = "!@######CONC:test_module::test_func:my_function:0:5000000:2500000:10######@!\n"
test_results_no_class = TestResults(perf_stdout=stdout_no_class)
metrics_no_class = parse_concurrency_metrics(test_results_no_class, "my_function")
assert metrics_no_class is not None
assert metrics_no_class.concurrency_ratio == 2.0 # 5000000 / 2500000
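
    # For reference, the marker layout implied by the fixtures above appears to be
    # !@######CONC:<module>:<class>:<func>:<tested_function>:<iteration>:
    # <sequential_ns>:<concurrent_ns>:<concurrency_factor>######@!, with duplicate
    # entries for the same tested function averaged. Reverse-read from the test
    # strings above, not from parse_concurrency_metrics' documentation.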