Merge branch 'main' of github.com:codeflash-ai/codeflash into bootstrapped-benchmarking

This commit is contained in:
RD 2025-01-22 16:27:48 -08:00
commit fe91ece425
10 changed files with 179 additions and 39 deletions

View file

@ -126,7 +126,8 @@ def collect_setup_info() -> SetupInfo:
valid_module_subdirs = [d for d in valid_subdirs if d != "tests"]
curdir_option = f"current directory ({curdir})"
module_subdir_options = [*valid_module_subdirs, curdir_option]
custom_dir_option = "enter a custom directory…"
module_subdir_options = [*valid_module_subdirs, curdir_option, custom_dir_option]
module_root_answer = inquirer_wrapper(
inquirer.list_input,
@ -135,7 +136,20 @@ def collect_setup_info() -> SetupInfo:
choices=module_subdir_options,
default=(project_name if project_name in module_subdir_options else module_subdir_options[0]),
)
module_root = "." if module_root_answer == curdir_option else module_root_answer
if module_root_answer == curdir_option:
module_root = "."
elif module_root_answer == custom_dir_option:
custom_module_root_answer = inquirer_wrapper_path(
"path",
message=f"Enter the path to your module directory inside {Path(curdir).resolve()}{os.path.sep} ",
path_type=inquirer.Path.DIRECTORY,
)
if custom_module_root_answer:
module_root = Path(curdir) / Path(custom_module_root_answer["path"])
else:
apologize_and_exit()
else:
module_root = module_root_answer
ph("cli-project-root-provided")
# Discover test directory
@ -452,7 +466,6 @@ def configure_pyproject_toml(setup_info: SetupInfo) -> None:
tool_section["codeflash"] = codeflash_section
pyproject_data["tool"] = tool_section
click.echo("Writing Codeflash configuration…\r", nl=False)
with toml_path.open("w", encoding="utf8") as pyproject_file:
pyproject_file.write(tomlkit.dumps(pyproject_data))
click.echo(f"✅ Added Codeflash configuration to {toml_path}")

View file

@ -43,8 +43,8 @@ def generate_candidates(source_code_path: Path) -> list[str]:
def prepare_coverage_files() -> tuple[Path, Path]:
"""Prepare coverage configuration and output files."""
coverage_out_file = get_run_tmp_file(Path("coverage.json"))
coverage_database_file = get_run_tmp_file(Path(".coverage"))
coveragercfile = get_run_tmp_file(Path(".coveragerc"))
coveragerc_content = f"[run]\n branch = True\n [json]\n output = {coverage_out_file.as_posix()}\n"
coveragerc_content = f"[run]\n branch = True\ndata_file={coverage_database_file}\n"
coveragercfile.write_text(coveragerc_content)
return coverage_out_file, coveragercfile
return coverage_database_file, coveragercfile

View file

@ -1,4 +1,5 @@
import sys
from typing import Any
# This script should not have any relation to the codeflash package, be careful with imports
cwd = sys.argv[1]
@ -16,8 +17,8 @@ class PytestCollectionPlugin:
pytest_rootdir = session.config.rootdir
def parse_pytest_collection_results(pytest_tests: list[any]) -> list[dict[str, str]]:
test_results: list[list[str]] = []
def parse_pytest_collection_results(pytest_tests: list[Any]) -> list[dict[str, str]]:
test_results = []
for test in pytest_tests:
test_class = None
if test.cls:

View file

@ -215,19 +215,51 @@ class CoverageData:
main_func_coverage: FunctionCoverage
dependent_func_coverage: Union[FunctionCoverage, None]
status: CoverageStatus
blank_re: Pattern = re.compile(r"\s*(#|$)")
else_re: Pattern = re.compile(r"\s*else\s*:\s*(#|$)")
blank_re: Pattern[str] = re.compile(r"\s*(#|$)")
else_re: Pattern[str] = re.compile(r"\s*else\s*:\s*(#|$)")
@staticmethod
def load_from_coverage_file(
coverage_file_path: Path, source_code_path: Path, function_name: str, code_context: CodeOptimizationContext
def load_from_sqlite_database(
database_path: Path, function_name: str, code_context: CodeOptimizationContext, source_code_path: Path
) -> CoverageData:
"""Load coverage data, including main function and its dependencies."""
from json import load
"""Load coverage data from an SQLite database, mimicking the behavior of load_from_coverage_file."""
from coverage import Coverage
from coverage.jsonreport import JsonReporter
cov = Coverage(data_file=database_path, data_suffix=True, auto_data=True, branch=True)
if not database_path.stat().st_size or not database_path.exists():
logger.debug(f"Coverage database {database_path} is empty or does not exist")
return CoverageData(
file_path=source_code_path,
coverage=0.0,
function_name=function_name,
functions_being_tested=[],
graph={},
code_context=code_context,
main_func_coverage=FunctionCoverage(
name=function_name,
coverage=0.0,
executed_lines=[],
unexecuted_lines=[],
executed_branches=[],
unexecuted_branches=[],
),
dependent_func_coverage=None,
status=CoverageStatus.NOT_FOUND,
)
cov.load()
reporter = JsonReporter(cov)
temp_json_file = database_path.with_suffix(".report.json")
with temp_json_file.open("w") as f:
reporter.report(morfs=[source_code_path.as_posix()], outfile=f)
with temp_json_file.open() as f:
original_coverage_data = json.load(f)
coverage_data, status = CoverageData._parse_coverage_file(temp_json_file, source_code_path)
with coverage_file_path.open() as f:
original_coverage_data = load(f) # we can remove this once we're done debugging
coverage_data, status = CoverageData._parse_coverage_file(coverage_file_path, source_code_path)
main_func_coverage, dependent_func_coverage = CoverageData._fetch_function_coverages(
function_name, code_context, coverage_data, original_cov_data=original_coverage_data
)
@ -245,6 +277,8 @@ class CoverageData:
functions_being_tested.append(dependent_func_coverage.name)
graph = CoverageData._build_graph(main_func_coverage, dependent_func_coverage)
temp_json_file.unlink()
return CoverageData(
file_path=source_code_path,
coverage=coverage,

View file

@ -1167,10 +1167,10 @@ class Optimizer:
code_context: CodeOptimizationContext | None = None,
unittest_loop_index: int | None = None,
) -> tuple[TestResults, CoverageData | None]:
coverage_out_file = None
coverage_database_file = None
try:
if testing_type == TestingMode.BEHAVIOR:
result_file_path, run_result, coverage_out_file = run_behavioral_tests(
result_file_path, run_result, coverage_database_file = run_behavioral_tests(
test_files,
test_framework=self.args.test_framework,
cwd=self.args.project_root,
@ -1217,7 +1217,7 @@ class Optimizer:
function_name=function_name,
source_file=source_file,
code_context=code_context,
coverage_file=coverage_out_file,
coverage_database_file=coverage_database_file,
)
return results, coverage_results

View file

@ -436,7 +436,7 @@ def parse_test_results(
optimization_iteration: int,
function_name: str | None,
source_file: Path | None,
coverage_file: Path | None,
coverage_database_file: Path | None,
code_context: CodeOptimizationContext | None = None,
run_result: subprocess.CompletedProcess | None = None,
unittest_loop_index: int | None = None,
@ -478,15 +478,13 @@ def parse_test_results(
results = merge_test_results(test_results_xml, test_results_bin_file, test_config.test_framework)
all_args = False
if coverage_file and coverage_file.exists() and source_file and code_context and function_name:
if coverage_database_file and source_file and code_context and function_name:
all_args = True
coverage = CoverageData.load_from_coverage_file(
coverage_file_path=coverage_file,
coverage = CoverageData.load_from_sqlite_database(
database_path=coverage_database_file,
source_code_path=source_file,
code_context=code_context,
function_name=function_name,
)
coverage_file.unlink(missing_ok=True)
Path(".coverage").unlink(missing_ok=True)
coverage.log_coverage()
return results, coverage if all_args else None

View file

@ -69,7 +69,7 @@ def run_behavioral_tests(
pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
if enable_coverage:
coverage_out_file, coveragercfile = prepare_coverage_files()
coverage_database_file, coveragercfile = prepare_coverage_files()
cov_erase = execute_test_subprocess(
shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage erase"), cwd=cwd, env=pytest_test_env
@ -88,17 +88,6 @@ def run_behavioral_tests(
timeout=600,
)
logger.debug(results)
cov_report = execute_test_subprocess(
shlex.split(f"{SAFE_SYS_EXECUTABLE} -m coverage json --rcfile={coveragercfile.as_posix()}"),
cwd=cwd,
env=pytest_test_env,
) # this will generate a json file with the coverage data
logger.debug(cov_report)
if "No data to report." in cov_report.stdout:
logger.warning("No coverage data to report. Check if the tests are running correctly.")
console.rule()
coverage_out_file = None
else:
results = execute_test_subprocess(
pytest_cmd_list + common_pytest_args + result_args + test_files,
@ -115,7 +104,7 @@ def run_behavioral_tests(
else:
raise ValueError(f"Unsupported test framework: {test_framework}")
return result_file_path, results, coverage_out_file if enable_coverage else None
return result_file_path, results, coverage_database_file if enable_coverage else None
def run_benchmarking_tests(

View file

@ -1,6 +1,8 @@
import ast
import site
from collections.abc import Generator
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
@ -16,6 +18,7 @@ from codeflash.code_utils.code_utils import (
module_name_from_file_path,
path_belongs_to_site_packages,
)
from codeflash.code_utils.coverage_utils import generate_candidates, prepare_coverage_files
@pytest.fixture
@ -26,6 +29,13 @@ def multiple_existing_and_non_existing_files(tmp_path: Path) -> list[Path]:
file.touch()
return existing_files + non_existing_files
@pytest.fixture
def mock_get_run_tmp_file() -> Generator[MagicMock, None, None]:
    """Yield a mock that replaces ``get_run_tmp_file`` inside ``coverage_utils``.

    The patch stays active while the requesting test runs and is reverted
    once the generator is resumed after the test.
    """
    patcher = patch("codeflash.code_utils.coverage_utils.get_run_tmp_file")
    with patcher as patched_tmp_file:
        yield patched_tmp_file
def test_get_qualified_name_valid() -> None:
module_name = "codeflash"
full_qualified_name = "codeflash.utils.module"
@ -47,6 +57,7 @@ def test_get_qualified_name_same_name() -> None:
with pytest.raises(ValueError, match="is the same as codeflash"):
get_qualified_name(module_name, full_qualified_name)
# tests for module_name_from_file_path
def test_module_name_from_file_path() -> None:
project_root_path = Path("/Users/codeflashuser/PycharmProjects/codeflash")
@ -91,6 +102,8 @@ def test_get_imports_from_file_with_file_path(tmp_path: Path) -> None:
assert imports[0].names[0].name == "os"
assert imports[1].module == "sys"
assert imports[1].names[0].name == "path"
def test_get_imports_from_file_with_file_string() -> None:
file_string = "import os\nfrom sys import path\n"
@ -102,6 +115,7 @@ def test_get_imports_from_file_with_file_string() -> None:
assert imports[1].module == "sys"
assert imports[1].names[0].name == "path"
def test_get_imports_from_file_with_file_ast() -> None:
file_string = "import os\nfrom sys import path\n"
file_ast = ast.parse(file_string)
@ -114,6 +128,7 @@ def test_get_imports_from_file_with_file_ast() -> None:
assert imports[1].module == "sys"
assert imports[1].names[0].name == "path"
def test_get_imports_from_file_with_syntax_error(caplog: pytest.LogCaptureFixture) -> None:
file_string = "import os\nfrom sys import path\ninvalid syntax"
@ -173,6 +188,7 @@ async def bar():
assert success is True
assert function_names == ["foo", "bar"]
def test_get_all_function_names_with_syntax_error(caplog: pytest.LogCaptureFixture) -> None:
code = """
def foo():
@ -234,6 +250,7 @@ def test_get_run_tmp_file_reuses_temp_directory() -> None:
assert tmp_file_path1.parent.name.startswith("codeflash_")
assert tmp_file_path1.parent.exists()
def test_path_belongs_to_site_packages_with_site_package_path(monkeypatch: pytest.MonkeyPatch) -> None:
site_packages = [Path("/usr/local/lib/python3.9/site-packages")]
monkeypatch.setattr(site, "getsitepackages", lambda: site_packages)
@ -241,6 +258,7 @@ def test_path_belongs_to_site_packages_with_site_package_path(monkeypatch: pytes
file_path = Path("/usr/local/lib/python3.9/site-packages/some_package")
assert path_belongs_to_site_packages(file_path) is True
def test_path_belongs_to_site_packages_with_non_site_package_path(monkeypatch: pytest.MonkeyPatch) -> None:
site_packages = [Path("/usr/local/lib/python3.9/site-packages")]
monkeypatch.setattr(site, "getsitepackages", lambda: site_packages)
@ -248,6 +266,7 @@ def test_path_belongs_to_site_packages_with_non_site_package_path(monkeypatch: p
file_path = Path("/usr/local/lib/python3.9/other_directory/some_package")
assert path_belongs_to_site_packages(file_path) is False
def test_path_belongs_to_site_packages_with_relative_path(monkeypatch: pytest.MonkeyPatch) -> None:
site_packages = [Path("/usr/local/lib/python3.9/site-packages")]
monkeypatch.setattr(site, "getsitepackages", lambda: site_packages)
@ -332,3 +351,30 @@ def test_cleanup_paths(multiple_existing_and_non_existing_files: list[Path]) ->
cleanup_paths(multiple_existing_and_non_existing_files)
for file in multiple_existing_and_non_existing_files:
assert not file.exists()
def test_generate_candidates() -> None:
    """Check that generate_candidates lists every trailing sub-path, shortest first.

    The expected list starts at the bare file name and grows one parent
    directory at a time, ending at the full path without its leading root.
    """
    target = Path("/Users/krrt7/Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py")
    candidates = generate_candidates(target)
    assert candidates == [
        "coverage_utils.py",
        "code_utils/coverage_utils.py",
        "codeflash/code_utils/coverage_utils.py",
        "cli/codeflash/code_utils/coverage_utils.py",
        "codeflash/cli/codeflash/code_utils/coverage_utils.py",
        "work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
        "Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
        "krrt7/Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
        "Users/krrt7/Desktop/work/codeflash/cli/codeflash/code_utils/coverage_utils.py",
    ]
def test_prepare_coverage_files(mock_get_run_tmp_file: MagicMock) -> None:
    """Check the paths returned by prepare_coverage_files and the rcfile text it writes."""
    database_stub = MagicMock(spec=Path)
    rcfile_stub = MagicMock(spec=Path)
    # The patched get_run_tmp_file hands out the database stub on its first
    # call and the rcfile stub on its second.
    mock_get_run_tmp_file.side_effect = [database_stub, rcfile_stub]

    returned_database, returned_rcfile = prepare_coverage_files()

    assert returned_database == database_stub
    assert returned_rcfile == rcfile_stub
    expected_rc_text = f"[run]\n branch = True\ndata_file={database_stub}\n"
    rcfile_stub.write_text.assert_called_once_with(expected_rc_text)

View file

@ -3,3 +3,61 @@ sidebar_position: 4
---
# How Codeflash Works
Codeflash follows a "generate and verify" approach to optimize code. It uses LLMs to generate optimizations, then rigorously verifies whether those optimizations are indeed faster and whether they preserve the original behavior. The basic unit of optimization is a function—Codeflash tries to speed up the function while ensuring that it still behaves the same way. This way, if you merge the optimized code, it simply runs faster without breaking any functionality.
## Analysis of your code
Codeflash scans your codebase to identify all available functions. It locates existing unit tests in your projects and maps which functions they test. When optimizing a function, Codeflash runs these discovered tests to verify nothing has broken.
#### What kind of functions can Codeflash optimize?
Codeflash works best with self-contained functions that have minimal side effects (like communicating with external systems or sending network requests). Codeflash optimizes a group of functions - consisting of an entry point function and any other functions it directly calls.
Currently, Codeflash cannot optimize async functions.
#### Test Discovery
Codeflash currently only runs tests that directly call the target function in their test body. To discover tests that indirectly call the function, you can use the Codeflash Tracer. The Tracer analyzes your test suite and identifies all tests that eventually call a function.
## Optimization Generation
To optimize code, Codeflash first gathers all necessary context from the codebase. It then calls our backend to generate several candidate optimizations. These are called "candidates" because their speed and correctness haven't been verified yet. Both properties will be verified in later steps.
## Verification of correctness
![Verification](/img/verification.svg)
The goal of correctness verification is to ensure that when the original code is replaced by the new code, there are no behavioral changes in the code and the rest of the system. This means the replacement should be completely safe.
To verify correctness, Codeflash calls the function with numerous inputs, confirming that the new function behaves identically to the original.
Codeflash verifies that the following behaviors are preserved:
- function return values match exactly
- inputs to function have been mutated exactly the same way as before
- exception types remain consistent
Additionally, Codeflash checks that the code under optimization has sufficient line coverage, which increases confidence in the testing process.
We recommend manually reviewing the optimized code, since there might be important input cases that we haven't verified where the behavior could differ.
#### Test Generation
Codeflash generates two types of tests:
- LLM Generated tests - Codeflash uses LLMs to create several regression test cases that cover typical function usage, edge cases, and large-scale inputs to verify both correctness and performance.
- Concolic coverage tests - Codeflash uses state-of-the-art concolic testing with an SMT Solver (a theorem prover) to explore execution paths and generate function arguments. This aims to maximize code coverage for the function being optimized. Codeflash runs the resulting test file to verify correctness. Currently, this feature only supports pytest.
## Code Execution
Codeflash runs tests for the target function using either the pytest or unittest framework. The tests execute on your machine, ensuring access to your Python environment and any other dependencies needed to run your code properly. Running on your machine also ensures accurate performance measurements, since runtime varies by system.
#### Performance benchmarking
Codeflash implements several techniques to measure code performance accurately. In particular, it runs multiple iterations of the code in a loop to determine the best performance with the minimum runtime. Codeflash compares performance of the original code against the optimization, requiring at least a 10% speed improvement before considering it faster. This approach eliminates most runtime measurement variability, even on noisy CI systems and virtual machines. The final runtime Codeflash reports is the minimum total time it took to run all the test cases.
## Creating Pull Requests
Once an optimization passes all checks, Codeflash creates a pull request through the Codeflash GitHub app directly in your repository. The pull request includes the new code, the speedup percentage, an explanation of the optimization, test statistics including coverage, and the test content itself. You can review and merge the new code if it meets your standards. Feel free to modify the code as needed—we welcome your improvements!

1
docs/static/img/verification.svg vendored Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 24 KiB