from __future__ import annotations import os from pathlib import Path from unittest.mock import Mock from codeflash_core import performance_gain from codeflash_python.analysis._coverage import ( CoverageData, CoverageStatus, FunctionCoverage, ) from codeflash_python.benchmarking.models import ConcurrencyMetrics from codeflash_python.context.models import CodeOptimizationContext from codeflash_python.test_discovery.models import TestType from codeflash_python.testing.models import ( FunctionTestInvocation, InvocationId, TestResults, ) from codeflash_python.verification._critic import ( concurrency_gain, coverage_critic, get_pr_number, quantity_of_tests_critic, speedup_critic, throughput_gain, ) from codeflash_python.verification.models import OptimizedCandidateResult def test_performance_gain() -> None: """performance_gain returns the correct relative speedup.""" assert ( performance_gain(original_runtime_ns=1000, optimized_runtime_ns=0) == 0.0 ) assert ( performance_gain(original_runtime_ns=1000, optimized_runtime_ns=500) == 1.0 ) assert ( performance_gain(original_runtime_ns=1000, optimized_runtime_ns=900) == 0.1111111111111111 ) assert ( performance_gain(original_runtime_ns=1000, optimized_runtime_ns=1000) == 0.0 ) assert ( performance_gain(original_runtime_ns=1000, optimized_runtime_ns=1100) == -0.09090909090909091 ) def test_speedup_critic() -> None: """speedup_critic accepts candidates above the noise floor.""" original_code_runtime = 1000 best_runtime_until_now = 1000 candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=800, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=12, ) assert speedup_critic( candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True, ) # 20% improvement candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=940, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert not speedup_critic( candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True, ) # 6% improvement original_code_runtime = 100000 best_runtime_until_now = 100000 candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=94000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert speedup_critic( candidate_result, original_code_runtime, best_runtime_until_now, disable_gh_action_noise=True, ) # 6% improvement def test_generated_test_critic() -> None: """quantity_of_tests_critic requires enough passing tests.""" test_1 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_1", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_1"), did_pass=True, runtime=0, test_framework="pytest", test_type=TestType.GENERATED_REGRESSION, return_value=None, cpu_runtime=0, timed_out=False, loop_index=1, ) test_2 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_2", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_2"), did_pass=True, runtime=0, test_framework="pytest", test_type=TestType.GENERATED_REGRESSION, return_value=None, cpu_runtime=0, timed_out=False, loop_index=1, ) test_3 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_3", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_3"), did_pass=True, runtime=0, test_framework="pytest", test_type=TestType.EXISTING_UNIT_TEST, return_value=None, cpu_runtime=0, timed_out=False, loop_index=1, ) test_4 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_4", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_4"), did_pass=False, runtime=0, test_framework="pytest", test_type=TestType.GENERATED_REGRESSION, return_value=None, cpu_runtime=0, timed_out=False, loop_index=1, ) test_5 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_5", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_5"), did_pass=True, runtime=0, test_framework="pytest", test_type=TestType.REPLAY_TEST, return_value=None, cpu_runtime=0, timed_out=False, loop_index=1, ) test_6 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_6", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_6"), did_pass=True, runtime=0, test_framework="pytest", test_type=TestType.GENERATED_REGRESSION, return_value=None, cpu_runtime=0, timed_out=False, loop_index=2, ) test_7 = FunctionTestInvocation( id=InvocationId( test_module_path="", test_class_name="", test_function_name="test_7", function_getting_tested="sorter", iteration_id="", ), file_name=Path("test_7"), did_pass=True, runtime=0, test_framework="pytest", test_type=TestType.EXISTING_UNIT_TEST, return_value=None, cpu_runtime=0, timed_out=False, loop_index=1, ) test_results = [ test_1, test_2, test_3, test_4, test_5, test_6, test_7, test_1, ] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) test_results = [ test_1, test_2, test_3, test_6, test_7, test_1, test_4, test_1, ] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) test_results = [ test_1, test_3, test_4, test_2, test_7, test_1, test_6, test_1, ] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) test_results = [test_1] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert not quantity_of_tests_critic(candidate_result) test_results = [ test_1, test_2, test_3, test_4, test_5, test_1, test_1, test_1, ] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) test_results = [test_1, test_4, test_6] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert not quantity_of_tests_critic(candidate_result) test_results = [test_4, test_5] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) test_results = [ test_1, test_2, test_3, test_4, test_5, test_1, test_1, test_1, ] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) get_pr_number.cache_clear() os.environ["CODEFLASH_PR_NUMBER"] = "1234" test_results = [test_1, test_2, test_3, test_6] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert not quantity_of_tests_critic(candidate_result) test_results = [test_1, test_2, test_3, test_4] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert not quantity_of_tests_critic(candidate_result) test_results = [ test_1, test_2, test_3, test_5, test_1, test_1, test_1, test_1, ] candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=100, behavior_test_results=TestResults(test_results=test_results), benchmarking_test_results=TestResults(), total_candidate_timing=12, optimization_candidate_index=0, ) assert quantity_of_tests_critic(candidate_result) del os.environ["CODEFLASH_PR_NUMBER"] def test_coverage_critic() -> None: """coverage_critic passes when coverage is above the threshold.""" mock_code_context = Mock(spec=CodeOptimizationContext) passing_coverage = CoverageData( file_path=Path("test_file.py"), coverage=100.0, function_name="test_function", functions_being_tested=["function1", "function2"], graph={}, code_context=mock_code_context, main_func_coverage=FunctionCoverage( name="test_function", coverage=100.0, executed_lines=[10], unexecuted_lines=[2], executed_branches=[[5]], unexecuted_branches=[[1]], ), dependent_func_coverage=None, status=CoverageStatus.PARSED_SUCCESSFULLY, ) assert coverage_critic(passing_coverage) is True border_coverage = CoverageData( file_path=Path("test_file.py"), coverage=60.0, function_name="test_function", functions_being_tested=["function1", "function2"], graph={}, code_context=mock_code_context, main_func_coverage=FunctionCoverage( name="test_function", coverage=50.0, executed_lines=[10], unexecuted_lines=[2], executed_branches=[[5]], unexecuted_branches=[[1]], ), dependent_func_coverage=None, status=CoverageStatus.PARSED_SUCCESSFULLY, ) assert coverage_critic(border_coverage) is True failing_coverage = CoverageData( file_path=Path("test_file.py"), coverage=30.0, function_name="test_function", functions_being_tested=["function1", "function2"], graph={}, code_context=mock_code_context, main_func_coverage=FunctionCoverage( name="test_function", coverage=0.0, executed_lines=[], unexecuted_lines=[10], executed_branches=[], unexecuted_branches=[[5]], ), dependent_func_coverage=None, status=CoverageStatus.PARSED_SUCCESSFULLY, ) assert coverage_critic(failing_coverage) is False def test_throughput_gain() -> None: """throughput_gain calculates relative throughput improvement.""" assert ( throughput_gain(original_throughput=100, optimized_throughput=150) == 0.5 ) assert ( throughput_gain(original_throughput=100, optimized_throughput=100) == 0.0 ) assert ( throughput_gain(original_throughput=100, optimized_throughput=80) == -0.2 ) assert ( throughput_gain(original_throughput=0, optimized_throughput=50) == 0.0 ) assert ( throughput_gain(original_throughput=50, optimized_throughput=200) == 3.0 ) def test_speedup_critic_with_async_throughput() -> None: """speedup_critic evaluates async throughput alongside runtime.""" original_code_runtime = 10000 original_async_throughput = 100 # Both runtime and throughput improve significantly candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=8000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=8000, async_throughput=120, ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, disable_gh_action_noise=True, ) # Runtime improves, throughput below threshold (should pass) candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=8000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=8000, async_throughput=105, ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, disable_gh_action_noise=True, ) # Throughput improves, runtime below threshold (should pass) candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=9800, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=9800, async_throughput=120, ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, disable_gh_action_noise=True, ) # No throughput data - falls back to runtime-only candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=8000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=8000, async_throughput=None, ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=None, best_throughput_until_now=None, disable_gh_action_noise=True, ) # best_throughput_until_now comparison candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=8000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=8000, async_throughput=115, ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, disable_gh_action_noise=True, ) assert not speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=7000, original_async_throughput=original_async_throughput, best_throughput_until_now=120, disable_gh_action_noise=True, ) # Zero original throughput (edge case) candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=8000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=8000, async_throughput=50, ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=0, best_throughput_until_now=None, disable_gh_action_noise=True, ) def test_concurrency_gain() -> None: """concurrency_gain measures relative concurrency ratio improvement.""" original = ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=10_000_000, concurrency_factor=10, concurrency_ratio=1.0, ) optimized = ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=1_000_000, concurrency_factor=10, concurrency_ratio=10.0, ) assert concurrency_gain(original, optimized) == 9.0 same = ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=10_000_000, concurrency_factor=10, concurrency_ratio=1.0, ) assert concurrency_gain(original, same) == 0.0 slightly_better = ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=8_000_000, concurrency_factor=10, concurrency_ratio=1.25, ) assert concurrency_gain(original, slightly_better) == 0.25 zero_ratio = ConcurrencyMetrics( sequential_time_ns=0, concurrent_time_ns=1_000_000, concurrency_factor=10, concurrency_ratio=0.0, ) assert concurrency_gain(zero_ratio, optimized) == 0.0 def test_speedup_critic_with_concurrency_metrics() -> None: """speedup_critic accepts candidates with concurrency improvements.""" original_code_runtime = 10000 original_async_throughput = 100 original_concurrency = ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=10_000_000, concurrency_factor=10, concurrency_ratio=1.0, ) # Concurrency improves significantly (blocking -> non-blocking) candidate_result = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=10000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=10000, async_throughput=100, concurrency_metrics=ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=1_000_000, concurrency_factor=10, concurrency_ratio=10.0, ), ) assert speedup_critic( candidate_result=candidate_result, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, original_concurrency_metrics=original_concurrency, best_concurrency_ratio_until_now=None, disable_gh_action_noise=True, ) # No concurrency improvement (falls back to runtime) candidate_result_no_conc = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=8000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=8000, async_throughput=100, concurrency_metrics=ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=10_000_000, concurrency_factor=10, concurrency_ratio=1.0, ), ) assert speedup_critic( candidate_result=candidate_result_no_conc, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, original_concurrency_metrics=original_concurrency, best_concurrency_ratio_until_now=None, disable_gh_action_noise=True, ) # Concurrency below threshold (20% required) candidate_result_below_threshold = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=10000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=10000, async_throughput=100, concurrency_metrics=ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=9_000_000, concurrency_factor=10, concurrency_ratio=1.11, ), ) assert not speedup_critic( candidate_result=candidate_result_below_threshold, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, original_concurrency_metrics=original_concurrency, best_concurrency_ratio_until_now=None, disable_gh_action_noise=True, ) # best_concurrency_ratio_until_now comparison candidate_result_good = OptimizedCandidateResult( max_loop_count=5, best_test_runtime=10000, behavior_test_results=TestResults(), benchmarking_test_results=TestResults(), optimization_candidate_index=0, total_candidate_timing=10000, async_throughput=100, concurrency_metrics=ConcurrencyMetrics( sequential_time_ns=10_000_000, concurrent_time_ns=2_000_000, concurrency_factor=10, concurrency_ratio=5.0, ), ) assert not speedup_critic( candidate_result=candidate_result_good, original_code_runtime=original_code_runtime, best_runtime_until_now=None, original_async_throughput=original_async_throughput, best_throughput_until_now=None, original_concurrency_metrics=original_concurrency, best_concurrency_ratio_until_now=10.0, disable_gh_action_noise=True, ) def test_concurrency_ratio_display_formatting() -> None: """Concurrency ratio display strings are formatted correctly.""" orig_ratio = 0.05 cand_ratio = 0.15 conc_gain = ( ((cand_ratio - orig_ratio) / orig_ratio * 100) if orig_ratio > 0 else 0 ) display_string = ( f"Concurrency ratio: {orig_ratio:.2f}x " f"\u2192 {cand_ratio:.2f}x ({conc_gain:+.1f}%)" ) assert display_string == "Concurrency ratio: 0.05x \u2192 0.15x (+200.0%)" orig_ratio = 1.0 cand_ratio = 10.0 conc_gain = ( ((cand_ratio - orig_ratio) / orig_ratio * 100) if orig_ratio > 0 else 0 ) display_string = ( f"Concurrency ratio: {orig_ratio:.2f}x " f"\u2192 {cand_ratio:.2f}x ({conc_gain:+.1f}%)" ) assert display_string == "Concurrency ratio: 1.00x \u2192 10.00x (+900.0%)" orig_ratio = 0.01 cand_ratio = 0.03 conc_gain = ( ((cand_ratio - orig_ratio) / orig_ratio * 100) if orig_ratio > 0 else 0 ) display_string = ( f"Concurrency ratio: {orig_ratio:.2f}x " f"\u2192 {cand_ratio:.2f}x ({conc_gain:+.1f}%)" ) assert display_string == "Concurrency ratio: 0.01x \u2192 0.03x (+200.0%)"