Merge branch 'main' of github.com:codeflash-ai/codeflash into bootstrapped-benchmarking

2026-05-04 18:25:18 +00:00 · 2025-01-22 16:27:48 -08:00 · 2025-01-22 16:27:48 -08:00 · fe91ece425
commit fe91ece425
parent 4d585c1d55 f95863daff
10 changed files with 179 additions and 39 deletions
--- a/cli/codeflash/cli_cmds/cmd_init.py
+++ b/cli/codeflash/cli_cmds/cmd_init.py
@ -126,7 +126,8 @@ def collect_setup_info() -> SetupInfo:
    valid_module_subdirs = [d for d in valid_subdirs if d != "tests"]

    curdir_option = f"current directory ({curdir})"
-    module_subdir_options = [*valid_module_subdirs, curdir_option]
+    custom_dir_option = "enter a custom directory…"
+    module_subdir_options = [*valid_module_subdirs, curdir_option, custom_dir_option]

    module_root_answer = inquirer_wrapper(
        inquirer.list_input,
@ -135,7 +136,20 @@ def collect_setup_info() -> SetupInfo:
        choices=module_subdir_options,
        default=(project_name if project_name in module_subdir_options else module_subdir_options[0]),
    )
-    module_root = "." if module_root_answer == curdir_option else module_root_answer
+    if module_root_answer == curdir_option:
+        module_root = "."
+    elif module_root_answer == custom_dir_option:
+        custom_module_root_answer = inquirer_wrapper_path(
+            "path",
+            message=f"Enter the path to your module directory inside {Path(curdir).resolve()}{os.path.sep} ",
+            path_type=inquirer.Path.DIRECTORY,
+        )
+        if custom_module_root_answer:
+            module_root = Path(curdir) / Path(custom_module_root_answer["path"])
+        else:
+            apologize_and_exit()
+    else:
+        module_root = module_root_answer
    ph("cli-project-root-provided")

    # Discover test directory
@ -452,7 +466,6 @@ def configure_pyproject_toml(setup_info: SetupInfo) -> None:
    tool_section["codeflash"] = codeflash_section
    pyproject_data["tool"] = tool_section

-    click.echo("Writing Codeflash configuration…\r", nl=False)
    with toml_path.open("w", encoding="utf8") as pyproject_file:
        pyproject_file.write(tomlkit.dumps(pyproject_data))
    click.echo(f"✅ Added Codeflash configuration to {toml_path}")
--- a/cli/codeflash/code_utils/coverage_utils.py
+++ b/cli/codeflash/code_utils/coverage_utils.py
@ -43,8 +43,8 @@ def generate_candidates(source_code_path: Path) -> list[str]:

 def prepare_coverage_files() -> tuple[Path, Path]:
    """Prepare coverage configuration and output files."""
-    coverage_out_file = get_run_tmp_file(Path("coverage.json"))
+    coverage_database_file = get_run_tmp_file(Path(".coverage"))
    coveragercfile = get_run_tmp_file(Path(".coveragerc"))
-    coveragerc_content = f"[run]\n branch = True\n [json]\n output = {coverage_out_file.as_posix()}\n"
+    coveragerc_content = f"[run]\n branch = True\ndata_file={coverage_database_file}\n"
    coveragercfile.write_text(coveragerc_content)
-    return coverage_out_file, coveragercfile
+    return coverage_database_file, coveragercfile
--- a/cli/codeflash/discovery/pytest_new_process_discovery.py
+++ b/cli/codeflash/discovery/pytest_new_process_discovery.py
@ -1,4 +1,5 @@
 import sys
+from typing import Any

 # This script should not have any relation to the codeflash package, be careful with imports
 cwd = sys.argv[1]
@ -16,8 +17,8 @@ class PytestCollectionPlugin:
        pytest_rootdir = session.config.rootdir


-def parse_pytest_collection_results(pytest_tests: list[any]) -> list[dict[str, str]]:
-    test_results: list[list[str]] = []
+def parse_pytest_collection_results(pytest_tests: list[Any]) -> list[dict[str, str]]:
+    test_results = []
    for test in pytest_tests:
        test_class = None
        if test.cls:
--- a/cli/codeflash/models/models.py
+++ b/cli/codeflash/models/models.py
@ -215,19 +215,51 @@ class CoverageData:
    main_func_coverage: FunctionCoverage
    dependent_func_coverage: Union[FunctionCoverage, None]
    status: CoverageStatus
-    blank_re: Pattern = re.compile(r"\s*(#|$)")
-    else_re: Pattern = re.compile(r"\s*else\s*:\s*(#|$)")
+    blank_re: Pattern[str] = re.compile(r"\s*(#|$)")
+    else_re: Pattern[str] = re.compile(r"\s*else\s*:\s*(#|$)")

    @staticmethod
-    def load_from_coverage_file(
-        coverage_file_path: Path, source_code_path: Path, function_name: str, code_context: CodeOptimizationContext
+    def load_from_sqlite_database(
+        database_path: Path, function_name: str, code_context: CodeOptimizationContext, source_code_path: Path
    ) -> CoverageData:
-        """Load coverage data, including main function and its dependencies."""
-        from json import load
+        """Load coverage data from an SQLite database, mimicking the behavior of load_from_coverage_file."""
+        from coverage import Coverage
+        from coverage.jsonreport import JsonReporter
+
+        cov = Coverage(data_file=database_path, data_suffix=True, auto_data=True, branch=True)
+        if not database_path.stat().st_size or not database_path.exists():
+            logger.debug(f"Coverage database {database_path} is empty or does not exist")
+            return CoverageData(
+                file_path=source_code_path,
+                coverage=0.0,
+                function_name=function_name,
+                functions_being_tested=[],
+                graph={},
+                code_context=code_context,
+                main_func_coverage=FunctionCoverage(
+                    name=function_name,
+                    coverage=0.0,
+                    executed_lines=[],
+                    unexecuted_lines=[],
+                    executed_branches=[],
+                    unexecuted_branches=[],
+                ),
+                dependent_func_coverage=None,
+                status=CoverageStatus.NOT_FOUND,
+            )
+
+        cov.load()
+
+        reporter = JsonReporter(cov)
+        temp_json_file = database_path.with_suffix(".report.json")
+        with temp_json_file.open("w") as f:
+            reporter.report(morfs=[source_code_path.as_posix()], outfile=f)
+
+        with temp_json_file.open() as f:
+            original_coverage_data = json.load(f)
+
+        coverage_data, status = CoverageData._parse_coverage_file(temp_json_file, source_code_path)

-        with coverage_file_path.open() as f:
-            original_coverage_data = load(f)  # we can remove this once we're done debugging
-        coverage_data, status = CoverageData._parse_coverage_file(coverage_file_path, source_code_path)
        main_func_coverage, dependent_func_coverage = CoverageData._fetch_function_coverages(
            function_name, code_context, coverage_data, original_cov_data=original_coverage_data
        )
@ -245,6 +277,8 @@ class CoverageData:
            functions_being_tested.append(dependent_func_coverage.name)

        graph = CoverageData._build_graph(main_func_coverage, dependent_func_coverage)
+        temp_json_file.unlink()
+
        return CoverageData(
            file_path=source_code_path,
            coverage=coverage,
--- a/cli/codeflash/optimization/optimizer.py
+++ b/cli/codeflash/optimization/optimizer.py
@ -1167,10 +1167,10 @@ class Optimizer:
        code_context: CodeOptimizationContext | None = None,
        unittest_loop_index: int | None = None,
    ) -> tuple[TestResults, CoverageData | None]:
-        coverage_out_file = None
+        coverage_database_file = None
        try:
            if testing_type == TestingMode.BEHAVIOR:
-                result_file_path, run_result, coverage_out_file = run_behavioral_tests(
+                result_file_path, run_result, coverage_database_file = run_behavioral_tests(
                    test_files,
                    test_framework=self.args.test_framework,
                    cwd=self.args.project_root,
@ -1217,7 +1217,7 @@ class Optimizer:
            function_name=function_name,
            source_file=source_file,
            code_context=code_context,
-            coverage_file=coverage_out_file,
+            coverage_database_file=coverage_database_file,
        )
        return results, coverage_results

--- a/cli/codeflash/verification/parse_test_output.py
+++ b/cli/codeflash/verification/parse_test_output.py
@ -436,7 +436,7 @@ def parse_test_results(
    optimization_iteration: int,
    function_name: str | None,
    source_file: Path | None,
-    coverage_file: Path | None,
+    coverage_database_file: Path | None,
    code_context: CodeOptimizationContext | None = None,
    run_result: subprocess.CompletedProcess | None = None,
    unittest_loop_index: int | None = None,
@ -478,15 +478,13 @@ def parse_test_results(
    results = merge_test_results(test_results_xml, test_results_bin_file, test_config.test_framework)

    all_args = False
-    if coverage_file and coverage_file.exists() and source_file and code_context and function_name:
+    if coverage_database_file and source_file and code_context and function_name:
        all_args = True
-        coverage = CoverageData.load_from_coverage_file(
-            coverage_file_path=coverage_file,
+        coverage = CoverageData.load_from_sqlite_database(
+            database_path=coverage_database_file,
            source_code_path=source_file,
            code_context=code_context,
            function_name=function_name,
        )
-        coverage_file.unlink(missing_ok=True)
-        Path(".coverage").unlink(missing_ok=True)
        coverage.log_coverage()
    return results, coverage if all_args else None
--- a/cli/codeflash/verification/test_runner.py
+++ b/cli/codeflash/verification/test_runner.py
@ -69,7 +69,7 @@ def run_behavioral_tests(
        pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"

        if enable_coverage:
-            coverage_out_file, coveragercfile = prepare_coverage_files()
+            coverage_database_file, coveragercfile = prepare_coverage_files()

            cov_erase = execute_test_subprocess(
                shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage erase"), cwd=cwd, env=pytest_test_env
@ -88,17 +88,6 @@ def run_behavioral_tests(
                timeout=600,
            )
            logger.debug(results)
-
-            cov_report = execute_test_subprocess(
-                shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage json --rcfile={coveragercfile.as_posix()}"),
-                cwd=cwd,
-                env=pytest_test_env,
-            )  # this will generate a json file with the coverage data
-            logger.debug(cov_report)
-            if "No data to report." in cov_report.stdout:
-                logger.warning("No coverage data to report. Check if the tests are running correctly.")
-                console.rule()
-                coverage_out_file = None
        else:
            results = execute_test_subprocess(
                pytest_cmd_list + common_pytest_args + result_args + test_files,
@ -115,7 +104,7 @@ def run_behavioral_tests(
    else:
        raise ValueError(f"Unsupported test framework: {test_framework}")

-    return result_file_path, results, coverage_out_file if enable_coverage else None
+    return result_file_path, results, coverage_database_file if enable_coverage else None


 def run_benchmarking_tests(
--- a/cli/tests/test_code_utils.py
+++ b/cli/tests/test_code_utils.py
@ -1,6 +1,8 @@
 import ast
 import site
+from collections.abc import Generator
 from pathlib import Path
+from unittest.mock import MagicMock, patch

 import pytest

@ -16,6 +18,7 @@ from codeflash.code_utils.code_utils import (
    module_name_from_file_path,
    path_belongs_to_site_packages,
 )
+from codeflash.code_utils.coverage_utils import generate_candidates, prepare_coverage_files


@pytest.fixture
@ -26,6 +29,13 @@ def multiple_existing_and_non_existing_files(tmp_path: Path) -> list[Path]:
        file.touch()
    return existing_files + non_existing_files

+
+@pytest.fixture
+def mock_get_run_tmp_file() -> Generator[MagicMock, None, None]:
+    with patch("codeflash.code_utils.coverage_utils.get_run_tmp_file") as mock:
+        yield mock
+
+
 def test_get_qualified_name_valid() -> None:
    module_name = "codeflash"
    full_qualified_name = "codeflash.utils.module"
@ -47,6 +57,7 @@ def test_get_qualified_name_same_name() -> None:
    with pytest.raises(ValueError, match="is the same as codeflash"):
        get_qualified_name(module_name, full_qualified_name)

+
 # tests for module_name_from_file_path
 def test_module_name_from_file_path() -> None:
    project_root_path = Path("/Users/codeflashuser/PycharmProjects/codeflash")
@ -91,6 +102,8 @@ def test_get_imports_from_file_with_file_path(tmp_path: Path) -> None:
    assert imports[0].names[0].name == "os"
    assert imports[1].module == "sys"
    assert imports[1].names[0].name == "path"
+
+
 def test_get_imports_from_file_with_file_string() -> None:
    file_string = "import os\nfrom sys import path\n"

@ -102,6 +115,7 @@ def test_get_imports_from_file_with_file_string() -> None:
    assert imports[1].module == "sys"
    assert imports[1].names[0].name == "path"

+
 def test_get_imports_from_file_with_file_ast() -> None:
    file_string = "import os\nfrom sys import path\n"
    file_ast = ast.parse(file_string)
@ -114,6 +128,7 @@ def test_get_imports_from_file_with_file_ast() -> None:
    assert imports[1].module == "sys"
    assert imports[1].names[0].name == "path"

+
 def test_get_imports_from_file_with_syntax_error(caplog: pytest.LogCaptureFixture) -> None:
    file_string = "import os\nfrom sys import path\ninvalid syntax"

@ -173,6 +188,7 @@ async def bar():
    assert success is True
    assert function_names == ["foo", "bar"]

+
 def test_get_all_function_names_with_syntax_error(caplog: pytest.LogCaptureFixture) -> None:
    code = """
 def foo():
@ -234,6 +250,7 @@ def test_get_run_tmp_file_reuses_temp_directory() -> None:
    assert tmp_file_path1.parent.name.startswith("codeflash_")
    assert tmp_file_path1.parent.exists()

+
 def test_path_belongs_to_site_packages_with_site_package_path(monkeypatch: pytest.MonkeyPatch) -> None:
    site_packages = [Path("/usr/local/lib/python3.9/site-packages")]
    monkeypatch.setattr(site, "getsitepackages", lambda: site_packages)
@ -241,6 +258,7 @@ def test_path_belongs_to_site_packages_with_site_package_path(monkeypatch: pytes
    file_path = Path("/usr/local/lib/python3.9/site-packages/some_package")
    assert path_belongs_to_site_packages(file_path) is True

+
 def test_path_belongs_to_site_packages_with_non_site_package_path(monkeypatch: pytest.MonkeyPatch) -> None:
    site_packages = [Path("/usr/local/lib/python3.9/site-packages")]
    monkeypatch.setattr(site, "getsitepackages", lambda: site_packages)
@ -248,6 +266,7 @@ def test_path_belongs_to_site_packages_with_non_site_package_path(monkeypatch: p
    file_path = Path("/usr/local/lib/python3.9/other_directory/some_package")
    assert path_belongs_to_site_packages(file_path) is False

+
 def test_path_belongs_to_site_packages_with_relative_path(monkeypatch: pytest.MonkeyPatch) -> None:
    site_packages = [Path("/usr/local/lib/python3.9/site-packages")]
    monkeypatch.setattr(site, "getsitepackages", lambda: site_packages)
@ -332,3 +351,30 @@ def test_cleanup_paths(multiple_existing_and_non_existing_files: list[Path]) ->
    cleanup_paths(multiple_existing_and_non_existing_files)
    for file in multiple_existing_and_non_existing_files:
        assert not file.exists()
+
+
+def test_generate_candidates() -> None:
+    source_code_path = Path("/Users/krrt7/Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py")
+    expected_candidates = [
+        "coverage_utils.py",
+        "code_utils/coverage_utils.py",
+        "codeflash/code_utils/coverage_utils.py",
+        "cli/codeflash/code_utils/coverage_utils.py",
+        "codeflash/cli/codeflash/code_utils/coverage_utils.py",
+        "work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
+        "Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
+        "krrt7/Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
+        "Users/krrt7/Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
+    ]
+    assert generate_candidates(source_code_path) == expected_candidates
+
+
+def test_prepare_coverage_files(mock_get_run_tmp_file: MagicMock) -> None:
+    mock_coverage_file = MagicMock(spec=Path)
+    mock_coveragerc_file = MagicMock(spec=Path)
+    mock_get_run_tmp_file.side_effect = [mock_coverage_file, mock_coveragerc_file]
+
+    coverage_database_file, coveragercfile = prepare_coverage_files()
+    assert coverage_database_file == mock_coverage_file
+    assert coveragercfile == mock_coveragerc_file
+    mock_coveragerc_file.write_text.assert_called_once_with(f"[run]\n branch = True\ndata_file={mock_coverage_file}\n")
--- a/docs/docs/how-codeflash-works.md
+++ b/docs/docs/how-codeflash-works.md
@ -3,3 +3,61 @@ sidebar_position: 4
 ---
 # How Codeflash Works

+Codeflash follows a "generate and verify" approach to optimize code. It uses LLMs to generate optimizations, then rigorously verifies that those optimizations are indeed
+faster and that they preserve the original behavior. The basic unit of optimization is a function—Codeflash tries to speed up the function while ensuring that it still behaves the same way. This way, if you merge the optimized code, it simply runs faster without breaking any functionality.
+
+## Analysis of your code
+
+Codeflash scans your codebase to identify all available functions. It locates existing unit tests in your project and maps which functions they test. When optimizing a function, Codeflash runs these discovered tests to verify nothing has broken.
+
+#### What kind of functions can Codeflash optimize?
+
+Codeflash works best with self-contained functions that have minimal side effects (like communicating with external systems or sending network requests). Codeflash optimizes a group of functions - consisting of an entry point function and any other functions it directly calls.
+Currently, Codeflash cannot optimize async functions.
+
+#### Test Discovery
+
+Codeflash currently only runs tests that directly call the target function in their test body. To discover tests that indirectly call the function, you can use the Codeflash Tracer. The Tracer analyzes your test suite and identifies all tests that eventually call a function.
+
+## Optimization Generation
+
+To optimize code, Codeflash first gathers all necessary context from the codebase. It then calls our backend to generate several candidate optimizations. These are called "candidates" because their speed and correctness haven't been verified yet. Both properties will be verified in later steps.
+
+## Verification of correctness
+
+![Verification](/img/verification.svg)
+
+The goal of correctness verification is to ensure that when the original code is replaced by the new code, there are no behavioral changes in the code and the rest of the system. This means the replacement should be completely safe.
+
+To verify correctness, Codeflash calls the function with numerous inputs, confirming that the new function behaves identically to the original.
+
+Codeflash verifies that the following behaviors are preserved:
+
+- function return values match exactly
+- inputs to the function are mutated in exactly the same way as before
+- exception types remain consistent
+
+Additionally, Codeflash checks that there is sufficient line coverage of the code under optimization, which increases confidence in the testing process.
+
+We recommend manually reviewing the optimized code, since there might be important input cases that we haven’t verified where the behavior could differ.
+
+#### Test Generation
+
+Codeflash generates two types of tests:
+
+- LLM Generated tests - Codeflash uses LLMs to create several regression test cases that cover typical function usage, edge cases, and large-scale inputs to verify both correctness and performance.
+- Concolic coverage tests - Codeflash uses state-of-the-art concolic testing with an SMT Solver (a theorem prover) to explore execution paths and generate function arguments. This aims to maximize code coverage for the function being optimized. Codeflash runs the resulting test file to verify correctness. Currently, this feature only supports pytest.
+
+## Code Execution
+
+Codeflash runs tests for the target function using either the pytest or unittest framework. The tests execute on your machine, ensuring access to your Python environment and any other associated dependencies, so that Codeflash can run your code properly. Running on your machine also ensures accurate performance measurements, since runtime varies by system.
+
+#### Performance benchmarking
+
+Codeflash implements several techniques to measure code performance accurately. In particular, it runs multiple iterations of the code in a loop to determine the best performance with the minimum runtime. Codeflash compares performance of the original code against the optimization, requiring at least a 10% speed improvement before considering it faster. This approach eliminates most runtime measurement variability, even on noisy CI systems and virtual machines. The final runtime Codeflash reports is the minimum total time it took to run all the test cases.
+
+## Creating Pull Requests
+
+Once an optimization passes all checks, Codeflash creates a pull request through the Codeflash GitHub app directly in your repository. The pull request includes the new code, the speedup percentage, an explanation of the optimization, test statistics including coverage, and the test content itself. You can review and merge the new code if it meets your standards. Feel free to modify the code as needed—we welcome your improvements!
--- a/docs/static/img/verification.svg
+++ b/docs/static/img/verification.svg