Merge remote-tracking branch 'origin/main' into call-graphee

# Conflicts:
#	.codex/skills/.gitignore
#	.gemini/skills/.gitignore
#	codeflash/languages/python/context/code_context_extractor.py
This commit is contained in:
Kevin Turcios 2026-02-19 01:05:42 -05:00
commit 2652e71617
82 changed files with 869 additions and 2346 deletions

4
.codex/config.toml Normal file
View file

@ -0,0 +1,4 @@
[mcp_servers.tessl]
type = "stdio"
command = "tessl"
args = [ "mcp", "start" ]

View file

@ -1,3 +0,0 @@
# Managed by Tessl
tessl:*
tessl__*

12
.gemini/settings.json Normal file
View file

@ -0,0 +1,12 @@
{
"mcpServers": {
"tessl": {
"type": "stdio",
"command": "tessl",
"args": [
"mcp",
"start"
]
}
}
}

View file

@ -1,3 +0,0 @@
# Managed by Tessl
tessl:*
tessl__*

View file

@ -128,7 +128,7 @@ def determine_js_package_manager(project_root: Path) -> JsPackageManager:
"""
# Search from project_root up to filesystem root for lock files
# This supports monorepo setups where lock file is at workspace root
current_dir = project_root.resolve()
current_dir = project_root
while current_dir != current_dir.parent:
if (current_dir / "bun.lockb").exists() or (current_dir / "bun.lock").exists():
return JsPackageManager.BUN
@ -161,7 +161,7 @@ def find_node_modules_with_package(project_root: Path, package_name: str) -> Pat
Path to the node_modules directory containing the package, or None if not found.
"""
current_dir = project_root.resolve()
current_dir = project_root
while current_dir != current_dir.parent:
node_modules = current_dir / "node_modules"
if node_modules.exists():

View file

@ -709,6 +709,7 @@ def inject_profiling_into_existing_test(
tests_project_root: Path,
mode: TestingMode = TestingMode.BEHAVIOR,
) -> tuple[bool, str | None]:
tests_project_root = tests_project_root.resolve()
if function_to_optimize.is_async:
return inject_async_profiling_into_existing_test(
test_path, call_positions, function_to_optimize, tests_project_root, mode

View file

@ -69,8 +69,8 @@ FUNCTION_NAME_REGEX = re.compile(r"([^.]+)\.([a-zA-Z0-9_]+)$")
class TestsCache:
SCHEMA_VERSION = 1 # Increment this when schema changes
def __init__(self, project_root_path: str | Path) -> None:
self.project_root_path = Path(project_root_path).resolve().as_posix()
def __init__(self, project_root_path: Path) -> None:
self.project_root_path = project_root_path.resolve().as_posix()
self.connection = sqlite3.connect(codeflash_cache_db)
self.cur = self.connection.cursor()

View file

@ -144,6 +144,27 @@ def find_functions_with_return_statement(ast_module: ast.Module, file_path: Path
# Multi-language support helpers
# =============================================================================
_VCS_EXCLUDES = frozenset({".git", ".hg", ".svn"})
def parse_dir_excludes(patterns: frozenset[str]) -> tuple[frozenset[str], tuple[str, ...], tuple[str, ...]]:
    """Split directory-name glob patterns into exact names, prefixes, and suffixes.

    Classification rules, applied in this order to every pattern:

    * ends with ``*``   -> prefix match; the text before the final ``*`` is kept
    * starts with ``*`` -> suffix match; the text after the leading ``*`` is kept
    * anything else     -> exact match

    Because the trailing-``*`` rule is checked first, a bare ``*`` yields an empty
    prefix (which ``str.startswith`` matches against every name) and ``*x*`` yields
    the literal prefix ``*x``; "contains" patterns are not supported.

    Args:
        patterns: Glob-like directory name patterns.

    Returns:
        A ``(exact, prefixes, suffixes)`` triple. ``prefixes`` and ``suffixes`` are
        sorted so the result does not depend on set iteration order, which varies
        between interpreter runs when string hash randomization is active.
    """
    exact_names: set[str] = set()
    prefix_parts: set[str] = set()
    suffix_parts: set[str] = set()
    for pattern in patterns:
        if pattern.endswith("*"):
            prefix_parts.add(pattern[:-1])
        elif pattern.startswith("*"):
            suffix_parts.add(pattern[1:])
        else:
            exact_names.add(pattern)
    # Sorting makes the tuples reproducible; callers only feed them to
    # str.startswith / str.endswith, where order is irrelevant.
    return frozenset(exact_names), tuple(sorted(prefix_parts)), tuple(sorted(suffix_parts))
def get_files_for_language(
module_root_path: Path, ignore_paths: list[Path] | None = None, language: Language | None = None
@ -162,37 +183,44 @@ def get_files_for_language(
if ignore_paths is None:
ignore_paths = []
all_patterns: frozenset[str]
if language is not None:
support = get_language_support(language)
extensions = support.file_extensions
all_patterns = support.dir_excludes | _VCS_EXCLUDES
else:
extensions = tuple(get_supported_extensions())
all_patterns = _VCS_EXCLUDES
for lang in Language:
if is_language_supported(lang):
all_patterns = all_patterns | get_language_support(lang).dir_excludes
# Default directory patterns to always exclude for JS/TS
js_ts_default_excludes = {
"node_modules",
"dist",
"build",
".next",
".nuxt",
"coverage",
".cache",
".turbo",
".vercel",
"__pycache__",
}
dir_excludes, prefixes, suffixes = parse_dir_excludes(all_patterns)
files = []
for ext in extensions:
pattern = f"*{ext}"
for file_path in module_root_path.rglob(pattern):
# Check explicit ignore paths
if any(file_path.is_relative_to(ignore_path) for ignore_path in ignore_paths):
continue
# Check default JS/TS excludes in path parts
if any(part in js_ts_default_excludes for part in file_path.parts):
continue
files.append(file_path)
ignore_dirs: set[str] = set()
ignore_files: set[Path] = set()
for p in ignore_paths:
p = Path(p) if not isinstance(p, Path) else p
if p.is_file():
ignore_files.add(p)
else:
ignore_dirs.add(str(p))
files: list[Path] = []
for dirpath, dirnames, filenames in os.walk(module_root_path):
dirnames[:] = [
d
for d in dirnames
if d not in dir_excludes
and not (prefixes and d.startswith(prefixes))
and not (suffixes and d.endswith(suffixes))
and str(Path(dirpath) / d) not in ignore_dirs
]
for fname in filenames:
if fname.endswith(extensions):
fpath = Path(dirpath, fname)
if fpath not in ignore_files:
files.append(fpath)
return files
@ -804,6 +832,7 @@ def filter_functions(
*,
disable_logs: bool = False,
) -> tuple[dict[Path, list[FunctionToOptimize]], int]:
resolved_project_root = project_root.resolve()
filtered_modified_functions: dict[str, list[FunctionToOptimize]] = {}
blocklist_funcs = get_blocklisted_functions()
logger.debug(f"Blocklisted functions: {blocklist_funcs}")
@ -880,7 +909,7 @@ def filter_functions(
lang_support = get_language_support(Path(file_path))
if lang_support.language == Language.PYTHON:
try:
ast.parse(f"import {module_name_from_file_path(Path(file_path), project_root)}")
ast.parse(f"import {module_name_from_file_path(Path(file_path), resolved_project_root)}")
except SyntaxError:
malformed_paths_count += 1
continue
@ -902,7 +931,10 @@ def filter_functions(
if previous_checkpoint_functions:
functions_tmp = []
for function in _functions:
if function.qualified_name_with_modules_from_root(project_root) in previous_checkpoint_functions:
if (
function.qualified_name_with_modules_from_root(resolved_project_root)
in previous_checkpoint_functions
):
previous_checkpoint_functions_removed_count += 1
continue
functions_tmp.append(function)

View file

@ -294,6 +294,14 @@ class LanguageSupport(Protocol):
"""Like # or //."""
...
@property
def dir_excludes(self) -> frozenset[str]:
    """Return the directory-name patterns that file discovery should skip.

    Each pattern uses one of three forms: "name" (exact match),
    "prefix*" (startswith), or "*suffix" (endswith).
    """
    ...
# === Discovery ===
def discover_functions(

View file

@ -44,8 +44,7 @@ class ImportResolver:
project_root: Root directory of the project.
"""
# Resolve to real path to handle macOS symlinks like /var -> /private/var
self.project_root = project_root.resolve()
self.project_root = project_root
self._resolution_cache: dict[tuple[Path, str], Path | None] = {}
def resolve_import(self, import_info: ImportInfo, source_file: Path) -> ResolvedImport | None:

View file

@ -63,6 +63,10 @@ class JavaScriptSupport:
def comment_prefix(self) -> str:
return "//"
@property
def dir_excludes(self) -> frozenset[str]:
    """Return the directory names to skip while discovering source files."""
    skipped_dirs = (
        "node_modules",
        "dist",
        "build",
        ".next",
        ".nuxt",
        "coverage",
        ".cache",
        ".turbo",
        ".vercel",
    )
    return frozenset(skipped_dirs)
# === Discovery ===
def discover_functions(

View file

@ -2,7 +2,6 @@ from __future__ import annotations
import ast
import hashlib
import os
from collections import defaultdict
from itertools import chain
from pathlib import Path
@ -37,7 +36,7 @@ from codeflash.optimization.function_context import belongs_to_function_qualifie
if TYPE_CHECKING:
from jedi.api.classes import Name
from codeflash.languages.base import DependencyResolver, HelperFunction
from codeflash.languages.base import HelperFunction
from codeflash.languages.python.context.unused_definition_remover import UsageInfo
# Error message constants
@ -81,7 +80,6 @@ def get_code_optimization_context(
project_root_path: Path,
optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
call_graph: DependencyResolver | None = None,
) -> CodeOptimizationContext:
# Route to language-specific implementation for non-Python languages
if not is_python():
@ -90,11 +88,9 @@ def get_code_optimization_context(
)
# Get FunctionSource representation of helpers of FTO
fto_input = {function_to_optimize.file_path: {function_to_optimize.qualified_name}}
if call_graph is not None:
helpers_of_fto_dict, helpers_of_fto_list = call_graph.get_callees(fto_input)
else:
helpers_of_fto_dict, helpers_of_fto_list = get_function_sources_from_jedi(fto_input, project_root_path)
helpers_of_fto_dict, helpers_of_fto_list = get_function_sources_from_jedi(
{function_to_optimize.file_path: {function_to_optimize.qualified_name}}, project_root_path
)
# Add function to optimize into helpers of FTO dict, as they'll be processed together
fto_as_function_source = get_function_to_optimize_as_function_source(function_to_optimize, project_root_path)
@ -110,13 +106,9 @@ def get_code_optimization_context(
for qualified_names in helpers_of_fto_qualified_names_dict.values():
qualified_names.update({f"{qn.rsplit('.', 1)[0]}.__init__" for qn in qualified_names if "." in qn})
# Get FunctionSource representation of helpers of helpers of FTO
if call_graph is not None:
helpers_of_helpers_dict, _helpers_of_helpers_list = call_graph.get_callees(helpers_of_fto_qualified_names_dict)
else:
helpers_of_helpers_dict, _helpers_of_helpers_list = get_function_sources_from_jedi(
helpers_of_fto_qualified_names_dict, project_root_path
)
helpers_of_helpers_dict, helpers_of_helpers_list = get_function_sources_from_jedi(
helpers_of_fto_qualified_names_dict, project_root_path
)
# Extract code context for optimization
final_read_writable_code = extract_code_markdown_context_from_files(
@ -192,6 +184,8 @@ def get_code_optimization_context(
code_hash_context = hashing_code_context.markdown
code_hash = hashlib.sha256(code_hash_context.encode("utf-8")).hexdigest()
all_helper_fqns = list({fs.fully_qualified_name for fs in helpers_of_fto_list + helpers_of_helpers_list})
return CodeOptimizationContext(
testgen_context=testgen_context,
read_writable_code=final_read_writable_code,
@ -199,6 +193,7 @@ def get_code_optimization_context(
hashing_code_context=code_hash_context,
hashing_code_context_hash=code_hash,
helper_functions=helpers_of_fto_list,
testgen_helper_fqns=all_helper_fqns,
preexisting_objects=preexisting_objects,
)
@ -257,7 +252,7 @@ def get_code_optimization_context_for_language(
fully_qualified_name=helper.qualified_name,
only_function_name=helper.name,
source_code=helper.source_code,
definition_type=None,
jedi_definition=None,
)
)
@ -323,13 +318,12 @@ def get_code_optimization_context_for_language(
return CodeOptimizationContext(
testgen_context=testgen_context,
read_writable_code=read_writable_code,
# Pass type definitions and globals as read-only context for the AI
# This way the AI sees them as context but doesn't include them in optimized output
read_only_context_code=code_context.read_only_context,
hashing_code_context=read_writable_code.flat,
hashing_code_context_hash=code_hash,
helper_functions=helper_function_sources,
preexisting_objects=set(), # Not implemented for non-Python yet
testgen_helper_fqns=[fs.fully_qualified_name for fs in helper_function_sources],
preexisting_objects=set(),
)
@ -480,7 +474,7 @@ def get_function_to_optimize_as_function_source(
fully_qualified_name=name.full_name,
only_function_name=name.name,
source_code=name.get_line_code(),
definition_type=name.type,
jedi_definition=name,
)
except Exception as e:
logger.exception(f"Error while getting function source: {e}")
@ -517,6 +511,10 @@ def get_function_sources_from_jedi(
# TODO: there can be multiple definitions, see how to handle such cases
definition = definitions[0]
definition_path = definition.module_path
if definition_path is not None:
rel = safe_relative_to(definition_path, project_root_path)
if not rel.is_absolute():
definition_path = project_root_path / rel
# The definition is part of this project and not defined within the original function
is_valid_definition = (
@ -525,15 +523,16 @@ def get_function_sources_from_jedi(
and not belongs_to_function_qualified(definition, qualified_function_name)
and definition.full_name.startswith(definition.module_name)
)
if is_valid_definition and definition.type in ("function", "class"):
if is_valid_definition and definition.type in ("function", "class", "statement"):
if definition.type == "function":
fqn = definition.full_name
func_name = definition.name
else:
# When a class is instantiated (e.g., MyClass()), track its __init__ as a helper
# This ensures the class definition with constructor is included in testgen context
elif definition.type == "class":
fqn = f"{definition.full_name}.__init__"
func_name = "__init__"
else:
fqn = definition.full_name
func_name = definition.name
qualified_name = get_qualified_name(definition.module_name, fqn)
# Avoid nested functions or classes. Only class.function is allowed
if len(qualified_name.split(".")) <= 2:
@ -543,7 +542,7 @@ def get_function_sources_from_jedi(
fully_qualified_name=fqn,
only_function_name=func_name,
source_code=definition.get_line_code(),
definition_type=definition.type,
jedi_definition=definition,
)
file_path_to_function_source[definition_path].add(function_source)
function_source_list.append(function_source)
@ -940,7 +939,11 @@ def is_project_path(module_path: Path | None, project_root_path: Path) -> bool:
# site-packages must be checked first because .venv/site-packages is under project root
if path_belongs_to_site_packages(module_path):
return False
return str(module_path).startswith(str(project_root_path) + os.sep)
try:
module_path.resolve().relative_to(project_root_path.resolve())
return True
except ValueError:
return False
def _is_project_module(module_name: str, project_root_path: Path) -> bool:

View file

@ -587,17 +587,20 @@ def revert_unused_helper_functions(
logger.debug(f"Reverting {len(unused_helpers)} unused helper function(s) to original definitions")
# Resolve all path keys for consistent comparison (Windows 8.3 short names may differ from Jedi-resolved paths)
resolved_original_helper_code = {p.resolve(): code for p, code in original_helper_code.items()}
# Group unused helpers by file path
unused_helpers_by_file = defaultdict(list)
for helper in unused_helpers:
unused_helpers_by_file[helper.file_path].append(helper)
unused_helpers_by_file[helper.file_path.resolve()].append(helper)
# For each file, revert the unused helper functions to their original definitions
for file_path, helpers_in_file in unused_helpers_by_file.items():
if file_path in original_helper_code:
if file_path in resolved_original_helper_code:
try:
# Get original code for this file
original_code = original_helper_code[file_path]
original_code = resolved_original_helper_code[file_path]
# Use the code replacer to selectively revert only the unused helper functions
helper_names = [helper.qualified_name for helper in helpers_in_file]

View file

@ -76,6 +76,37 @@ class PythonSupport:
def comment_prefix(self) -> str:
return "#"
@property
def dir_excludes(self) -> frozenset[str]:
    """Return the directory-name patterns to skip while discovering source files.

    Entries ending or starting with ``*`` are wildcard patterns; the rest are
    plain directory names.
    """
    plain_names = (
        "__pycache__",
        ".venv",
        "venv",
        ".tox",
        ".nox",
        ".eggs",
        ".mypy_cache",
        ".ruff_cache",
        ".pytest_cache",
        ".hypothesis",
        "htmlcov",
        ".pytype",
        ".pyre",
        ".pybuilder",
        ".ipynb_checkpoints",
        ".codeflash",
        ".cache",
        ".complexipy_cache",
        "build",
        "dist",
        "sdist",
    )
    wildcard_patterns = (".coverage*", ".pyright*", "*.egg-info")
    return frozenset(plain_names + wildcard_patterns)
# === Discovery ===
def discover_functions(

View file

@ -379,6 +379,7 @@ class CodeOptimizationContext(BaseModel):
hashing_code_context: str = ""
hashing_code_context_hash: str = ""
helper_functions: list[FunctionSource]
testgen_helper_fqns: list[str] = []
preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]]

View file

@ -2,6 +2,7 @@ from __future__ import annotations
import ast
import concurrent.futures
import dataclasses
import logging
import os
import queue
@ -445,9 +446,12 @@ class FunctionOptimizer:
replay_tests_dir: Path | None = None,
call_graph: DependencyResolver | None = None,
) -> None:
self.project_root = test_cfg.project_root_path
self.project_root = test_cfg.project_root_path.resolve()
self.test_cfg = test_cfg
self.aiservice_client = aiservice_client if aiservice_client else AiServiceClient()
resolved_file_path = function_to_optimize.file_path.resolve()
if resolved_file_path != function_to_optimize.file_path:
function_to_optimize = dataclasses.replace(function_to_optimize, file_path=resolved_file_path)
self.function_to_optimize = function_to_optimize
self.function_to_optimize_source_code = (
function_to_optimize_source_code
@ -582,6 +586,7 @@ class FunctionOptimizer:
test_results = self.generate_tests(
testgen_context=code_context.testgen_context,
helper_functions=code_context.helper_functions,
testgen_helper_fqns=code_context.testgen_helper_fqns,
generated_test_paths=generated_test_paths,
generated_perf_test_paths=generated_perf_test_paths,
)
@ -1453,7 +1458,7 @@ class FunctionOptimizer:
optimized_code = ""
if optimized_context is not None:
file_to_code_context = optimized_context.file_to_path()
optimized_code = file_to_code_context.get(str(path.relative_to(self.project_root)), "")
optimized_code = file_to_code_context.get(str(path.resolve().relative_to(self.project_root)), "")
new_code = format_code(
self.args.formatter_cmds, path, optimized_code=optimized_code, check_diff=True, exit_on_failure=False
@ -1524,7 +1529,8 @@ class FunctionOptimizer:
read_only_context_code=new_code_ctx.read_only_context_code,
hashing_code_context=new_code_ctx.hashing_code_context,
hashing_code_context_hash=new_code_ctx.hashing_code_context_hash,
helper_functions=new_code_ctx.helper_functions, # only functions that are read writable
helper_functions=new_code_ctx.helper_functions,
testgen_helper_fqns=new_code_ctx.testgen_helper_fqns,
preexisting_objects=new_code_ctx.preexisting_objects,
)
)
@ -1730,6 +1736,7 @@ class FunctionOptimizer:
self,
testgen_context: CodeStringsMarkdown,
helper_functions: list[FunctionSource],
testgen_helper_fqns: list[str],
generated_test_paths: list[Path],
generated_perf_test_paths: list[Path],
) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]:
@ -1738,13 +1745,9 @@ class FunctionOptimizer:
assert len(generated_test_paths) == n_tests
if not self.args.no_gen_tests:
# Submit test generation tasks
helper_fqns = testgen_helper_fqns or [definition.fully_qualified_name for definition in helper_functions]
future_tests = self.submit_test_generation_tasks(
self.executor,
testgen_context.markdown,
[definition.fully_qualified_name for definition in helper_functions],
generated_test_paths,
generated_perf_test_paths,
self.executor, testgen_context.markdown, helper_fqns, generated_test_paths, generated_perf_test_paths
)
future_concolic_tests = self.executor.submit(

View file

@ -126,10 +126,10 @@ def existing_tests_source_for(
tests_dir_name = test_cfg.tests_project_rootdir.name
if file_path.startswith((tests_dir_name + os.sep, tests_dir_name + "/")):
# Module path includes "tests." - use project root parent
instrumented_abs_path = (test_cfg.tests_project_rootdir.parent / file_path).resolve()
instrumented_abs_path = test_cfg.tests_project_rootdir.parent / file_path
else:
# Module path doesn't include tests dir - use tests root directly
instrumented_abs_path = (test_cfg.tests_project_rootdir / file_path).resolve()
instrumented_abs_path = test_cfg.tests_project_rootdir / file_path
logger.debug(f"[PR-DEBUG] Looking up: {instrumented_abs_path}")
logger.debug(f"[PR-DEBUG] Available keys: {list(instrumented_to_original.keys())[:3]}")
# Try to map instrumented path to original path

View file

@ -2,6 +2,7 @@ import logging
import sentry_sdk
from sentry_sdk.integrations.logging import LoggingIntegration
from sentry_sdk.integrations.stdlib import StdlibIntegration
def init_sentry(*, enabled: bool = False, exclude_errors: bool = False) -> None:
@ -16,12 +17,8 @@ def init_sentry(*, enabled: bool = False, exclude_errors: bool = False) -> None:
sentry_sdk.init(
dsn="https://4b9a1902f9361b48c04376df6483bc96@o4506833230561280.ingest.sentry.io/4506833262477312",
integrations=[sentry_logging],
# Set traces_sample_rate to 1.0 to capture 100%
# of transactions for performance monitoring.
traces_sample_rate=1.0,
# Set profiles_sample_rate to 1.0 to profile 100%
# of sampled transactions.
# We recommend adjusting this value in production.
profiles_sample_rate=1.0,
disabled_integrations=[StdlibIntegration],
traces_sample_rate=0,
profiles_sample_rate=0,
ignore_errors=[KeyboardInterrupt],
)

View file

@ -1,6 +1,7 @@
from __future__ import annotations
import ast
import importlib.util
import subprocess
import tempfile
import time
@ -18,6 +19,8 @@ from codeflash.lsp.helpers import is_LSP_enabled
from codeflash.telemetry.posthog_cf import ph
from codeflash.verification.verification_utils import TestConfig
CROSSHAIR_AVAILABLE = importlib.util.find_spec("crosshair") is not None
if TYPE_CHECKING:
from argparse import Namespace
@ -52,6 +55,10 @@ def generate_concolic_tests(
logger.debug("Skipping concolic test generation for non-Python languages (CrossHair is Python-only)")
return function_to_concolic_tests, concolic_test_suite_code
if not CROSSHAIR_AVAILABLE:
logger.debug("Skipping concolic test generation (crosshair-tool is not installed)")
return function_to_concolic_tests, concolic_test_suite_code
if is_LSP_enabled():
logger.debug("Skipping concolic test generation in LSP mode")
return function_to_concolic_tests, concolic_test_suite_code

View file

@ -47,8 +47,24 @@ def parse_func(file_path: Path) -> XMLParser:
return parse(file_path, xml_parser)
matches_re_start = re.compile(r"!\$######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######\$!\n")
matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")
# Start-of-test marker: "!$######<module>:<Class.>*<test>:<function>:<loop>:<iteration>######$!"
# The pattern requires a newline immediately after the closing "$!".
matches_re_start = re.compile(
r"!\$######([^:]*)" # group 1: module path
r":((?:[^:.]*\.)*)" # group 2: class prefix with trailing dot, or empty
r"([^.:]*)" # group 3: test function name
r":([^:]*)" # group 4: function being tested
r":([^:]*)" # group 5: loop index
r":([^#]*)" # group 6: iteration id
r"######\$!\n"
)
# End-of-test marker: same field layout, delimited by "!######" ... "######!" with no
# trailing newline required. Group 6 accepts any run of non-"#" characters, so an
# "iteration_id:runtime" pair is captured as one string including the colon.
matches_re_end = re.compile(
r"!######([^:]*)" # group 1: module path
r":((?:[^:.]*\.)*)" # group 2: class prefix with trailing dot, or empty
r"([^.:]*)" # group 3: test function name
r":([^:]*)" # group 4: function being tested
r":([^:]*)" # group 5: loop index
r":([^#]*)" # group 6: iteration_id or iteration_id:runtime
r"######!"
)
start_pattern = re.compile(r"!\$######([^:]*):([^:]*):([^:]*):([^:]*):([^:]+)######\$!")
@ -893,7 +909,6 @@ def merge_test_results(
return merged_test_results
FAILURES_HEADER_RE = re.compile(r"=+ FAILURES =+")
TEST_HEADER_RE = re.compile(r"_{3,}\s*(.*?)\s*_{3,}$")
@ -903,7 +918,7 @@ def parse_test_failures_from_stdout(stdout: str) -> dict[str, str]:
start = end = None
for i, line in enumerate(lines):
if FAILURES_HEADER_RE.search(line.strip()):
if "= FAILURES =" in line:
start = i
break

View file

@ -158,6 +158,10 @@ class TestConfig:
_language: Optional[str] = None # Language identifier for multi-language support
js_project_root: Optional[Path] = None # JavaScript project root (directory containing package.json)
def __post_init__(self) -> None:
self.project_root_path = self.project_root_path.resolve()
self.tests_project_rootdir = self.tests_project_rootdir.resolve()
@property
def test_framework(self) -> str:
"""Returns the appropriate test framework based on language.

View file

@ -39,7 +39,7 @@ dependencies = [
"dill>=0.3.8",
"rich>=13.8.1",
"lxml>=5.3.0",
"crosshair-tool>=0.0.78",
"crosshair-tool>=0.0.78; python_version < '3.15'",
"coverage>=7.6.4",
"line_profiler>=4.2.0",
"platformdirs>=4.3.7",
@ -207,6 +207,8 @@ warn_unreachable = true
install_types = true
plugins = ["pydantic.mypy"]
exclude = ["tests/", "code_to_optimize/", "pie_test_set/", "experiments/"]
[[tool.mypy.overrides]]
module = ["jedi", "jedi.api.classes", "inquirer", "inquirer.themes", "numba"]
ignore_missing_imports = true
@ -310,6 +312,9 @@ split-on-trailing-comma = false
docstring-code-format = true
skip-magic-trailing-comma = true
[tool.ty.src]
exclude = ["tests", "code_to_optimize", "pie_test_set", "experiments"]
[tool.hatch.version]
source = "uv-dynamic-versioning"

View file

@ -20,7 +20,7 @@
"version": "0.13.0"
},
"tessl/pypi-pydantic": {
"version": "1.10.0"
"version": "2.11.0"
},
"tessl/pypi-humanize": {
"version": "4.13.0"
@ -35,7 +35,7 @@
"version": "3.4.0"
},
"tessl/pypi-sentry-sdk": {
"version": "1.45.0"
"version": "2.36.0"
},
"tessl/pypi-parameterized": {
"version": "0.9.0"
@ -44,10 +44,10 @@
"version": "0.4.0"
},
"tessl/pypi-rich": {
"version": "13.9.0"
"version": "14.1.0"
},
"tessl/pypi-lxml": {
"version": "5.4.0"
"version": "6.0.0"
},
"tessl/pypi-crosshair-tool": {
"version": "0.0.0"
@ -64,17 +64,20 @@
"tessl/pypi-filelock": {
"version": "3.19.0"
},
"codeflash/codeflash-rules": {
"version": "0.1.0"
"tessl/pypi-ipython": {
"version": "9.5.0"
},
"codeflash/codeflash-docs": {
"version": "0.1.0"
"tessl/pypi-mypy": {
"version": "1.17.0"
},
"codeflash/codeflash-skills": {
"version": "0.2.0"
"tessl/pypi-ty": {
"version": "0.0.0"
},
"tessl-labs/tessl-skill-eval-scenarios": {
"version": "0.0.5"
"tessl/pypi-types-jsonschema": {
"version": "3.2.0"
},
"tessl/pypi-uv": {
"version": "0.8.0"
}
}
}

View file

@ -1171,7 +1171,12 @@ def test_repo_helper() -> None:
code_ctx = get_code_optimization_context(function_to_optimize, project_root)
read_write_context, read_only_context = code_ctx.read_writable_code, code_ctx.read_only_context_code
hashing_context = code_ctx.hashing_code_context
path_to_globals = project_root / "globals.py"
expected_read_write_context = f"""
```python:{path_to_globals.relative_to(project_root)}
# Define a global variable
API_URL = "https://api.example.com/data"
```
```python:{path_to_utils.relative_to(project_root)}
import math
@ -1264,7 +1269,12 @@ def test_repo_helper_of_helper() -> None:
code_ctx = get_code_optimization_context(function_to_optimize, project_root)
read_write_context, read_only_context = code_ctx.read_writable_code, code_ctx.read_only_context_code
hashing_context = code_ctx.hashing_code_context
path_to_globals = project_root / "globals.py"
expected_read_write_context = f"""
```python:{path_to_globals.relative_to(project_root)}
# Define a global variable
API_URL = "https://api.example.com/data"
```
```python:{path_to_utils.relative_to(project_root)}
import math
from transform_utils import DataTransformer
@ -1991,6 +2001,8 @@ class Calculator:
"""
expected_read_only_context = """
```python:utility_module.py
import sys
DEFAULT_PRECISION = "medium"
# Try-except block with variable definitions
@ -2001,6 +2013,17 @@ except ImportError:
# Used variable in except block
CALCULATION_BACKEND = "python"
# Nested if-else with variable definitions
if sys.platform.startswith('win'):
# Used variable in outer if
SYSTEM_TYPE = "windows"
elif sys.platform.startswith('linux'):
# Used variable in outer elif
SYSTEM_TYPE = "linux"
else:
# Used variable in outer else
SYSTEM_TYPE = "other"
# Function that will be used in the main code
def select_precision(precision, fallback_precision):
if precision is None:
@ -2207,6 +2230,8 @@ def get_system_details():
relative_path = file_path.relative_to(project_root)
expected_read_write_context = f"""
```python:utility_module.py
import sys
DEFAULT_PRECISION = "medium"
# Try-except block with variable definitions
@ -2217,6 +2242,17 @@ except ImportError:
# Used variable in except block
CALCULATION_BACKEND = "python"
# Nested if-else with variable definitions
if sys.platform.startswith('win'):
# Used variable in outer if
SYSTEM_TYPE = "windows"
elif sys.platform.startswith('linux'):
# Used variable in outer elif
SYSTEM_TYPE = "linux"
else:
# Used variable in outer else
SYSTEM_TYPE = "other"
# Function that will be used in the main code
def select_precision(precision, fallback_precision):
if precision is None:
@ -2257,6 +2293,8 @@ class Calculator:
"""
expected_read_only_context = """
```python:utility_module.py
import sys
DEFAULT_PRECISION = "medium"
# Try-except block with variable definitions
@ -2266,6 +2304,17 @@ try:
except ImportError:
# Used variable in except block
CALCULATION_BACKEND = "python"
# Nested if-else with variable definitions
if sys.platform.startswith('win'):
# Used variable in outer if
SYSTEM_TYPE = "windows"
elif sys.platform.startswith('linux'):
# Used variable in outer elif
SYSTEM_TYPE = "linux"
else:
# Used variable in outer else
SYSTEM_TYPE = "other"
```
"""
assert read_write_context.markdown.strip() == expected_read_write_context.strip()

View file

@ -0,0 +1,189 @@
"""Tests for the regex patterns and string matching in parse_test_output.py."""
from codeflash.verification.parse_test_output import (
matches_re_end,
matches_re_start,
parse_test_failures_from_stdout,
)
# --- matches_re_start tests ---
class TestMatchesReStart:
    """Checks that ``matches_re_start`` splits a start marker into its six groups."""

    def test_simple_no_class(self) -> None:
        marker = "!$######tests.test_foo:test_bar:target_func:1:abc######$!\n"
        found = matches_re_start.search(marker)
        assert found is not None
        assert found.groups() == ("tests.test_foo", "", "test_bar", "target_func", "1", "abc")

    def test_with_class(self) -> None:
        marker = "!$######tests.test_foo:MyClass.test_bar:target_func:1:abc######$!\n"
        found = matches_re_start.search(marker)
        assert found is not None
        assert found.groups() == ("tests.test_foo", "MyClass.", "test_bar", "target_func", "1", "abc")

    def test_nested_class(self) -> None:
        marker = "!$######a.b.c:A.B.test_x:func:3:id123######$!\n"
        found = matches_re_start.search(marker)
        assert found is not None
        assert found.groups() == ("a.b.c", "A.B.", "test_x", "func", "3", "id123")

    def test_empty_class_and_function(self) -> None:
        marker = "!$######mod::func:0:iter######$!\n"
        found = matches_re_start.search(marker)
        assert found is not None
        assert found.groups() == ("mod", "", "", "func", "0", "iter")

    def test_embedded_in_stdout(self) -> None:
        captured = "some output\n!$######mod:test_fn:f:1:x######$!\nmore output\n"
        found = matches_re_start.search(captured)
        assert found is not None
        assert found.groups() == ("mod", "", "test_fn", "f", "1", "x")

    def test_multiple_matches(self) -> None:
        first_marker = "!$######m1:C1.fn1:t1:1:a######$!\n"
        second_marker = "!$######m2:fn2:t2:2:b######$!\n"
        found_groups = [hit.groups() for hit in matches_re_start.finditer(first_marker + second_marker)]
        # Both markers must be found, in order, each with its own group values.
        assert found_groups == [
            ("m1", "C1.", "fn1", "t1", "1", "a"),
            ("m2", "", "fn2", "t2", "2", "b"),
        ]

    def test_no_match_without_newline(self) -> None:
        # The start pattern ends with a literal newline, so a bare marker must not match.
        assert matches_re_start.search("!$######mod:test_fn:f:1:x######$!") is None

    def test_dots_in_module_path(self) -> None:
        marker = "!$######a.b.c.d.e:test_fn:f:1:x######$!\n"
        found = matches_re_start.search(marker)
        assert found is not None
        assert found.group(1) == "a.b.c.d.e"
# --- matches_re_end tests ---
class TestMatchesReEnd:
    """Checks that ``matches_re_end`` splits an end marker into its six groups."""

    def test_simple_no_class_with_runtime(self) -> None:
        marker = "!######tests.test_foo:test_bar:target_func:1:abc:12345######!"
        found = matches_re_end.search(marker)
        assert found is not None
        assert found.groups() == ("tests.test_foo", "", "test_bar", "target_func", "1", "abc:12345")

    def test_with_class_no_runtime(self) -> None:
        marker = "!######tests.test_foo:MyClass.test_bar:target_func:1:abc######!"
        found = matches_re_end.search(marker)
        assert found is not None
        assert found.groups() == ("tests.test_foo", "MyClass.", "test_bar", "target_func", "1", "abc")

    def test_nested_class_with_runtime(self) -> None:
        marker = "!######mod:A.B.test_x:func:3:id123:99999######!"
        found = matches_re_end.search(marker)
        assert found is not None
        assert found.groups() == ("mod", "A.B.", "test_x", "func", "3", "id123:99999")

    def test_runtime_colon_preserved_in_group6(self) -> None:
        """Group 6 has to hold 'iteration_id:runtime' as one string, colon included."""
        found = matches_re_end.search("!######m:fn:f:1:iter42:98765######!")
        assert found is not None
        assert found.group(6) == "iter42:98765"

    def test_embedded_in_stdout(self) -> None:
        captured = "captured output\n!######mod:test_fn:f:1:x:500######!\nmore"
        found = matches_re_end.search(captured)
        assert found is not None
        assert found.groups() == ("mod", "", "test_fn", "f", "1", "x:500")
# --- Start/End pairing (simulates parse_test_xml matching logic) ---
class TestStartEndPairing:
    """Checks that a start marker can be looked up among keyed end markers."""

    def test_paired_markers(self) -> None:
        """One start/end pair shares a key once the runtime is cut from group 6."""
        stdout = (
            "!$######mod:Class.test_fn:func:1:iter1######$!\n"
            "test output here\n"
            "!######mod:Class.test_fn:func:1:iter1:54321######!"
        )
        start_matches = list(matches_re_start.finditer(stdout))

        end_by_key = {}
        for end_match in matches_re_end.finditer(stdout):
            captured = end_match.groups()
            # Group 6 is "iteration_id" or "iteration_id:runtime"; keep only the
            # part before the first colon so the key lines up with a start marker.
            iteration_id, colon, _ = captured[5].partition(":")
            if colon:
                key = captured[:5] + (iteration_id,)
            else:
                key = captured
            end_by_key[key] = end_match

        assert len(start_matches) == 1
        assert len(end_by_key) == 1
        # Start and end should pair on the first 5 groups + iteration_id
        (only_start,) = start_matches
        assert only_start.groups() in end_by_key
# --- parse_test_failures_from_stdout tests ---
class TestParseTestFailuresHeader:
    """Checks which stdout shapes make ``parse_test_failures_from_stdout`` report failures."""

    def test_standard_pytest_header(self) -> None:
        """A full-width FAILURES banner followed by one failing test is parsed."""
        captured = (
            "..F.\n"
            "=================================== FAILURES ===================================\n"
            "_______ test_foo _______\n"
            "\n"
            " def test_foo():\n"
            "> assert False\n"
            "E AssertionError\n"
            "\n"
            "test.py:3: AssertionError\n"
            "=========================== short test summary info ============================\n"
            "FAILED test.py::test_foo\n"
        )
        failures = parse_test_failures_from_stdout(captured)
        assert "test_foo" in failures

    def test_minimal_equals(self) -> None:
        """A banner with a single '=' on each side is still recognised."""
        captured = (
            "= FAILURES =\n"
            "_______ test_bar _______\n"
            "\n"
            " assert False\n"
            "\n"
            "test.py:1: AssertionError\n"
            "= short test summary info =\n"
        )
        failures = parse_test_failures_from_stdout(captured)
        assert "test_bar" in failures

    def test_no_failures_section(self) -> None:
        """Output with no FAILURES banner gives an empty result."""
        captured = "....\n4 passed in 0.1s\n"
        assert parse_test_failures_from_stdout(captured) == {}

    def test_word_failures_without_equals_is_not_matched(self) -> None:
        """The word 'FAILURES' with no surrounding '=' signs is not a header."""
        captured = (
            "FAILURES detected in module\n"
            "_______ test_baz _______\n"
            "\n"
            " assert False\n"
        )
        assert parse_test_failures_from_stdout(captured) == {}

    def test_failures_in_test_output_not_matched(self) -> None:
        """Test output that merely prints 'FAILURES' (no '=' signs) is ignored."""
        captured = (
            "Testing FAILURES handling\n"
            "All good\n"
        )
        assert parse_test_failures_from_stdout(captured) == {}

View file

@ -1,108 +0,0 @@
# AI Service
How codeflash communicates with the AI optimization backend.
## `AiServiceClient` (`api/aiservice.py`)
The client connects to the AI service at `https://app.codeflash.ai` (or `http://localhost:8000` when `CODEFLASH_AIS_SERVER=local`).
Authentication uses Bearer token from `get_codeflash_api_key()`. All requests go through `make_ai_service_request()` which handles JSON serialization via Pydantic encoder.
Timeout: 90s for production, 300s for local.
## Endpoints
### `/ai/optimize` — Generate Candidates
Method: `optimize_code()`
Sends source code + dependency context to generate optimization candidates.
Payload:
- `source_code` — The read-writable code (markdown format)
- `dependency_code` — Read-only context code
- `trace_id` — Unique trace ID for the optimization run
- `language` — `"python"`, `"javascript"`, or `"typescript"`
- `n_candidates` — Number of candidates to generate (controlled by effort level)
- `is_async` — Whether the function is async
- `is_numerical_code` — Whether the code is numerical (affects optimization strategy)
Returns: `list[OptimizedCandidate]` with `source=OptimizedCandidateSource.OPTIMIZE`
### `/ai/optimize_line_profiler` — Line-Profiler-Guided Candidates
Method: `optimize_python_code_line_profiler()`
Like `/optimize` but includes `line_profiler_results` to guide the LLM toward hot lines.
Returns: candidates with `source=OptimizedCandidateSource.OPTIMIZE_LP`
### `/ai/refine` — Refine Existing Candidate
Method: `refine_code()`
Request type: `AIServiceRefinerRequest`
Sends an existing candidate with runtime data and line profiler results to generate an improved version.
Key fields:
- `original_source_code` / `optimized_source_code` — Before and after
- `original_code_runtime` / `optimized_code_runtime` — Timing data
- `speedup` — Current speedup ratio
- `original_line_profiler_results` / `optimized_line_profiler_results`
Returns: candidates with `source=OptimizedCandidateSource.REFINE` and `parent_id` set to the refined candidate's ID
### `/ai/repair` — Fix Failed Candidate
Method: `repair_code()`
Request type: `AIServiceCodeRepairRequest`
Sends a failed candidate with test diffs showing what went wrong.
Key fields:
- `original_source_code` / `modified_source_code`
- `test_diffs: list[TestDiff]` — Each with `scope` (return_value/stdout/did_pass), original vs candidate values, and test source code
Returns: candidates with `source=OptimizedCandidateSource.REPAIR` and `parent_id` set
### `/ai/adaptive_optimize` — Multi-Candidate Adaptive
Method: `adaptive_optimize()`
Request type: `AIServiceAdaptiveOptimizeRequest`
Sends multiple previous candidates with their speedups for the LLM to learn from and generate better candidates.
Key fields:
- `candidates: list[AdaptiveOptimizedCandidate]` — Previous candidates with source code, explanation, source type, and speedup
Returns: candidates with `source=OptimizedCandidateSource.ADAPTIVE`
### `/ai/rewrite_jit` — JIT Rewrite
Method: `get_jit_rewritten_code()`
Rewrites code to use JIT compilation (e.g., Numba).
Returns: candidates with `source=OptimizedCandidateSource.JIT_REWRITE`
## Candidate Parsing
All endpoints return JSON with an `optimizations` array. Each entry has:
- `source_code` — Markdown-formatted code blocks
- `explanation` — LLM explanation
- `optimization_id` — Unique ID
- `parent_id` — Optional parent reference
- `model` — Which LLM model was used
`_get_valid_candidates()` parses the markdown code via `CodeStringsMarkdown.parse_markdown_code()` and filters out entries with empty code blocks.
## `LocalAiServiceClient`
Used when `CODEFLASH_EXPERIMENT_ID` is set. Mirrors `AiServiceClient` but sends to a separate experimental endpoint for A/B testing optimization strategies.
## LLM Call Sequencing
`AiServiceClient` tracks call sequence via `llm_call_counter` (itertools.count). Each request includes a `call_sequence` number, used by the backend to maintain conversation context across multiple calls for the same function.

View file

@ -1,79 +0,0 @@
# Configuration
Key configuration constants, effort levels, and thresholds.
## Constants (`code_utils/config_consts.py`)
### Test Execution
| Constant | Value | Description |
|----------|-------|-------------|
| `MAX_TEST_RUN_ITERATIONS` | 5 | Maximum test loop iterations |
| `INDIVIDUAL_TESTCASE_TIMEOUT` | 15s | Timeout per individual test case |
| `MAX_FUNCTION_TEST_SECONDS` | 60s | Max total time for function testing |
| `MAX_TEST_FUNCTION_RUNS` | 50 | Max test function executions |
| `MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS` | 100ms | Max cumulative test runtime |
| `TOTAL_LOOPING_TIME` | 10s | Candidate benchmarking budget |
| `MIN_TESTCASE_PASSED_THRESHOLD` | 6 | Minimum test cases that must pass |
### Performance Thresholds
| Constant | Value | Description |
|----------|-------|-------------|
| `MIN_IMPROVEMENT_THRESHOLD` | 0.05 (5%) | Minimum speedup to accept a candidate |
| `MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD` | 0.10 (10%) | Minimum async throughput improvement |
| `MIN_CONCURRENCY_IMPROVEMENT_THRESHOLD` | 0.20 (20%) | Minimum concurrency ratio improvement |
| `COVERAGE_THRESHOLD` | 60.0% | Minimum test coverage |
### Stability Thresholds
| Constant | Value | Description |
|----------|-------|-------------|
| `STABILITY_WINDOW_SIZE` | 0.35 | 35% of total iteration window |
| `STABILITY_CENTER_TOLERANCE` | 0.0025 | ±0.25% around median |
| `STABILITY_SPREAD_TOLERANCE` | 0.0025 | 0.25% window spread |
### Context Limits
| Constant | Value | Description |
|----------|-------|-------------|
| `OPTIMIZATION_CONTEXT_TOKEN_LIMIT` | 16000 | Max tokens for optimization context |
| `TESTGEN_CONTEXT_TOKEN_LIMIT` | 16000 | Max tokens for test generation context |
| `MAX_CONTEXT_LEN_REVIEW` | 1000 | Max context length for optimization review |
### Other
| Constant | Value | Description |
|----------|-------|-------------|
| `MIN_CORRECT_CANDIDATES` | 2 | Min correct candidates before skipping repair |
| `REPEAT_OPTIMIZATION_PROBABILITY` | 0.1 | Probability of re-optimizing a function |
| `DEFAULT_IMPORTANCE_THRESHOLD` | 0.001 | Minimum addressable time to consider a function |
| `CONCURRENCY_FACTOR` | 10 | Number of concurrent executions for concurrency benchmark |
| `REFINED_CANDIDATE_RANKING_WEIGHTS` | (2, 1) | (runtime, diff) weights — runtime 2x more important |
## Effort Levels
`EffortLevel` enum: `LOW`, `MEDIUM`, `HIGH`
Effort controls the number of candidates, repairs, and refinements:
| Key | LOW | MEDIUM | HIGH |
|-----|-----|--------|------|
| `N_OPTIMIZER_CANDIDATES` | 3 | 5 | 6 |
| `N_OPTIMIZER_LP_CANDIDATES` | 4 | 6 | 7 |
| `N_GENERATED_TESTS` | 2 | 2 | 2 |
| `MAX_CODE_REPAIRS_PER_TRACE` | 2 | 3 | 5 |
| `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` | 0.2 | 0.3 | 0.4 |
| `TOP_VALID_CANDIDATES_FOR_REFINEMENT` | 2 | 3 | 4 |
| `ADAPTIVE_OPTIMIZATION_THRESHOLD` | 0 | 0 | 2 |
| `MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE` | 0 | 0 | 4 |
Use `get_effort_value(EffortKeys.KEY, effort_level)` to retrieve values.
## Project Configuration
Configuration is read from `pyproject.toml` under `[tool.codeflash]`. Key settings are auto-detected by `setup/detector.py`:
- `module-root` — Root of the module to optimize
- `tests-root` — Root of test files
- `test-framework` — pytest, unittest, jest, etc.
- `formatter-cmds` — Code formatting commands

View file

@ -1,60 +0,0 @@
# Context Extraction
How codeflash extracts and limits code context for optimization and test generation.
## Overview
Context extraction (`context/code_context_extractor.py`) builds a `CodeOptimizationContext` containing all code needed for the LLM to understand and optimize a function, split into:
- **Read-writable code** (`CodeContextType.READ_WRITABLE`): The function being optimized plus its helper functions — code the LLM is allowed to modify
- **Read-only context** (`CodeContextType.READ_ONLY`): Dependency code for reference — imports, type definitions, base classes
- **Testgen context** (`CodeContextType.TESTGEN`): Context for test generation, may include imported class definitions and external base class inits
- **Hashing context** (`CodeContextType.HASHING`): Used for deduplication of optimization runs
## Token Limits
Both optimization and test generation contexts are token-limited:
- `OPTIMIZATION_CONTEXT_TOKEN_LIMIT = 16000` tokens
- `TESTGEN_CONTEXT_TOKEN_LIMIT = 16000` tokens
Token counting uses `encoded_tokens_len()` from `code_utils/code_utils.py`. Functions whose context exceeds these limits are skipped.
## Context Building Process
### 1. Helper Discovery
For the target function (`FunctionToOptimize`), the extractor finds:
- **Helpers of the function**: Functions/classes in the same file that the target function calls
- **Helpers of helpers**: Transitive dependencies of the helper functions
These are organized as `dict[Path, set[FunctionSource]]` — mapping file paths to the set of helper functions found in each file.
### 2. Code Extraction
`extract_code_markdown_context_from_files()` builds `CodeStringsMarkdown` from the helper dictionaries. Each file's relevant code is extracted as a `CodeString` with its file path.
### 3. Testgen Context Enrichment
`build_testgen_context()` extends the basic context with:
- Imported class definitions (resolved from imports)
- External base class `__init__` methods
- External class `__init__` methods referenced in the context
### 4. Unused Definition Removal
`detect_unused_helper_functions()` and `remove_unused_definitions_by_function_names()` from `context/unused_definition_remover.py` prune definitions that are not transitively reachable from the target function, reducing token usage.
### 5. Deduplication
The hashing context (`hashing_code_context`) generates a hash (`hashing_code_context_hash`) used to detect when the same function context has already been optimized in a previous run, avoiding redundant work.
## Key Functions
| Function | Location | Purpose |
|----------|----------|---------|
| `build_testgen_context()` | `context/code_context_extractor.py` | Build enriched testgen context |
| `extract_code_markdown_context_from_files()` | `context/code_context_extractor.py` | Convert helper dicts to `CodeStringsMarkdown` |
| `detect_unused_helper_functions()` | `context/unused_definition_remover.py` | Find unused definitions |
| `remove_unused_definitions_by_function_names()` | `context/unused_definition_remover.py` | Remove unused definitions |
| `collect_top_level_defs_with_usages()` | `context/unused_definition_remover.py` | Analyze definition usage |
| `encoded_tokens_len()` | `code_utils/code_utils.py` | Count tokens in code |

View file

@ -1,153 +0,0 @@
# Domain Types
Core data types used throughout the codeflash optimization pipeline.
## Function Representation
### `FunctionToOptimize` (`models/function_types.py`)
The canonical dataclass representing a function candidate for optimization. Works across Python, JavaScript, and TypeScript.
Key fields:
- `function_name: str` — The function name
- `file_path: Path` — Absolute file path where the function is located
- `parents: list[FunctionParent]` — Parent scopes (classes/functions), each with `name` and `type`
- `starting_line / ending_line: Optional[int]` — Line range (1-indexed)
- `is_async: bool` — Whether the function is async
- `is_method: bool` — Whether it belongs to a class
- `language: str` — Programming language (default: `"python"`)
Key properties:
- `qualified_name` — Full dotted name including parent classes (e.g., `MyClass.my_method`)
- `top_level_parent_name` — Name of outermost parent, or function name if no parents
- `class_name` — Immediate parent class name, or `None`
### `FunctionParent` (`models/function_types.py`)
Represents a parent scope: `name: str` (e.g., `"MyClass"`) and `type: str` (e.g., `"ClassDef"`).
### `FunctionSource` (`models/models.py`)
Represents a resolved function with source code. Used for helper functions in context extraction.
Fields: `file_path`, `qualified_name`, `fully_qualified_name`, `only_function_name`, `source_code`, `jedi_definition`.
## Code Representation
### `CodeString` (`models/models.py`)
A single code block with validated syntax:
- `code: str` — The source code
- `file_path: Optional[Path]` — Origin file path
- `language: str` — Language for validation (default: `"python"`)
Validates syntax on construction via `model_validator`.
### `CodeStringsMarkdown` (`models/models.py`)
A collection of `CodeString` blocks — the primary format for passing code through the pipeline.
Key properties:
- `.flat` — Combined source code with file-path comment prefixes (e.g., `# file: path/to/file.py`)
- `.markdown` — Markdown-formatted with fenced code blocks: `` ```python:filepath\ncode\n``` ``
- `.file_to_path()` — Dict mapping file path strings to code
Static method:
- `parse_markdown_code(markdown_code, expected_language)` — Parses markdown code blocks back into `CodeStringsMarkdown`
## Optimization Context
### `CodeOptimizationContext` (`models/models.py`)
Holds all code context needed for optimization:
- `read_writable_code: CodeStringsMarkdown` — Code the LLM can modify
- `read_only_context_code: str` — Reference-only dependency code
- `testgen_context: CodeStringsMarkdown` — Context for test generation
- `hashing_code_context: str` / `hashing_code_context_hash: str` — For deduplication
- `helper_functions: list[FunctionSource]` — Helper functions in the writable code
- `preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]]` — Objects that already exist in the code
### `CodeContextType` enum (`models/models.py`)
Defines context categories: `READ_WRITABLE`, `READ_ONLY`, `TESTGEN`, `HASHING`.
## Candidates
### `OptimizedCandidate` (`models/models.py`)
A generated code variant:
- `source_code: CodeStringsMarkdown` — The optimized code
- `explanation: str` — LLM explanation of the optimization
- `optimization_id: str` — Unique identifier
- `source: OptimizedCandidateSource` — How it was generated
- `parent_id: str | None` — ID of parent candidate (for refinements/repairs)
- `model: str | None` — Which LLM model generated it
### `OptimizedCandidateSource` enum (`models/models.py`)
How a candidate was generated: `OPTIMIZE`, `OPTIMIZE_LP` (line profiler), `REFINE`, `REPAIR`, `ADAPTIVE`, `JIT_REWRITE`.
### `CandidateEvaluationContext` (`models/models.py`)
Tracks state during candidate evaluation:
- `speedup_ratios` / `optimized_runtimes` / `is_correct` — Per-candidate results
- `ast_code_to_id` — Deduplication map (normalized AST → first seen candidate)
- `valid_optimizations` — Candidates that passed all checks
Key methods: `record_failed_candidate()`, `record_successful_candidate()`, `handle_duplicate_candidate()`, `register_new_candidate()`.
## Baseline & Results
### `OriginalCodeBaseline` (`models/models.py`)
Baseline measurements for the original code:
- `behavior_test_results: TestResults` / `benchmarking_test_results: TestResults`
- `line_profile_results: dict`
- `runtime: int` — Total runtime in nanoseconds
- `coverage_results: Optional[CoverageData]`
### `BestOptimization` (`models/models.py`)
The winning candidate after evaluation:
- `candidate: OptimizedCandidate`
- `helper_functions: list[FunctionSource]`
- `code_context: CodeOptimizationContext`
- `runtime: int`
- `winning_behavior_test_results` / `winning_benchmarking_test_results: TestResults`
## Test Types
### `TestType` enum (`models/test_type.py`)
- `EXISTING_UNIT_TEST` (1) — Pre-existing tests from the codebase
- `INSPIRED_REGRESSION` (2) — Tests inspired by existing tests
- `GENERATED_REGRESSION` (3) — AI-generated regression tests
- `REPLAY_TEST` (4) — Tests from recorded benchmark data
- `CONCOLIC_COVERAGE_TEST` (5) — Coverage-guided tests
- `INIT_STATE_TEST` (6) — Class init state verification
### `TestFile` / `TestFiles` (`models/models.py`)
`TestFile` represents a single test file with `instrumented_behavior_file_path`, optional `benchmarking_file_path`, `original_file_path`, `test_type`, and `tests_in_file`.
`TestFiles` is a collection with lookup methods: `get_by_type()`, `get_by_original_file_path()`, `get_test_type_by_instrumented_file_path()`.
### `TestResults` (`models/models.py`)
Collection of `FunctionTestInvocation` results with indexed lookup. Key methods:
- `add(invocation)` — Deduplicated insert
- `total_passed_runtime()` — Sum of minimum runtimes per test case (nanoseconds)
- `number_of_loops()` — Max loop index across all results
- `usable_runtime_data_by_test_case()` — Dict of invocation ID → list of runtimes
## Result Type
### `Result[L, R]` / `Success` / `Failure` (`either.py`)
Functional error handling type:
- `Success(value)` — Wraps a successful result
- `Failure(error)` — Wraps an error
- `result.is_successful()` / `result.is_failure()` — Check type
- `result.unwrap()` — Get success value (raises if Failure)
- `result.failure()` — Get failure value (raises if Success)
- `is_successful(result)` — Module-level helper function

View file

@ -1,41 +0,0 @@
# Codeflash Internal Documentation
CodeFlash is an AI-powered Python code optimizer that automatically improves code performance while maintaining correctness. It uses LLMs to generate optimization candidates, verifies correctness through test execution, and benchmarks performance improvements.
## Pipeline Overview
```
Discovery → Ranking → Context Extraction → Test Gen + Optimization → Baseline → Candidate Evaluation → PR
```
1. **Discovery** (`discovery/`): Find optimizable functions across the codebase using `FunctionVisitor`
2. **Ranking** (`benchmarking/function_ranker.py`): Rank functions by addressable time using trace data
3. **Context** (`context/`): Extract code dependencies — split into read-writable (modifiable) and read-only (reference)
4. **Optimization** (`optimization/`, `api/`): Generate candidates via AI service, runs concurrently with test generation
5. **Verification** (`verification/`): Run candidates against tests via custom pytest plugin, compare outputs
6. **Benchmarking** (`benchmarking/`): Measure performance, select best candidate by speedup
7. **Result** (`result/`, `github/`): Create PR with winning optimization
## Key Entry Points
| Task | File |
|------|------|
| CLI arguments & commands | `cli_cmds/cli.py` |
| Optimization orchestration | `optimization/optimizer.py` — `Optimizer.run()` |
| Per-function optimization | `optimization/function_optimizer.py` — `FunctionOptimizer` |
| Function discovery | `discovery/functions_to_optimize.py` |
| Context extraction | `context/code_context_extractor.py` |
| Test execution | `verification/test_runner.py`, `verification/pytest_plugin.py` |
| Performance ranking | `benchmarking/function_ranker.py` |
| Domain types | `models/models.py`, `models/function_types.py` |
| AI service | `api/aiservice.py` — `AiServiceClient` |
| Configuration | `code_utils/config_consts.py` |
## Documentation Pages
- [Domain Types](domain-types.md) — Core data types and their relationships
- [Optimization Pipeline](optimization-pipeline.md) — Step-by-step data flow through the pipeline
- [Context Extraction](context-extraction.md) — How code context is extracted and token-limited
- [Verification](verification.md) — Test execution, pytest plugin, deterministic patches
- [AI Service](ai-service.md) — AI service client endpoints and request types
- [Configuration](configuration.md) — Config schema, effort levels, thresholds

View file

@ -1,84 +0,0 @@
# Optimization Pipeline
Step-by-step data flow from function discovery to PR creation.
## 1. Entry Point: `Optimizer.run()` (`optimization/optimizer.py`)
The `Optimizer` class is initialized with CLI args and creates:
- `TestConfig` with test roots, project root, pytest command
- `AiServiceClient` for AI service communication
- Optional `LocalAiServiceClient` for experiments
`run()` orchestrates the full pipeline: discovers functions, optionally ranks them, then optimizes each in turn.
## 2. Function Discovery (`discovery/functions_to_optimize.py`)
`FunctionVisitor` traverses source files to find optimizable functions, producing `FunctionToOptimize` instances. Filters include:
- Skipping functions that are too small or trivial
- Skipping previously optimized functions (via `was_function_previously_optimized()`)
- Applying user-configured include/exclude patterns
## 3. Function Ranking (`benchmarking/function_ranker.py`)
When trace data is available, `FunctionRanker` ranks functions by **addressable time** — the time a function spends that could be optimized (own time + callee time / call count). Functions below `DEFAULT_IMPORTANCE_THRESHOLD=0.001` are skipped.
## 4. Per-Function Optimization: `FunctionOptimizer` (`optimization/function_optimizer.py`)
For each function, `FunctionOptimizer.optimize_function()` runs the full optimization loop:
### 4a. Context Extraction (`context/code_context_extractor.py`)
Extracts `CodeOptimizationContext` containing:
- `read_writable_code` — Code the LLM can modify (the function + helpers)
- `read_only_context_code` — Dependency code for reference only
- `testgen_context` — Context for test generation (may include imported class definitions)
Token limits are enforced: `OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000` and `TESTGEN_CONTEXT_TOKEN_LIMIT=16000`. Functions exceeding these are rejected.
### 4b. Concurrent Test Generation + LLM Optimization
These run in parallel using `concurrent.futures`:
- **Test generation**: Generates regression tests from the function context
- **LLM optimization**: Sends `read_writable_code.markdown` + `read_only_context_code` to the AI service
The number of candidates depends on effort level (see Configuration docs).
### 4c. Candidate Evaluation
For each `OptimizedCandidate`:
1. **Deduplication**: Normalize code AST and check against `CandidateEvaluationContext.ast_code_to_id`. If duplicate, copy results from previous evaluation.
2. **Code replacement**: Replace the original function with the candidate using `replace_function_definitions_in_module()`.
3. **Behavioral testing**: Run instrumented tests in subprocess. The custom pytest plugin applies deterministic patches. Compare return values, stdout, and pass/fail status against the original baseline.
4. **Benchmarking**: If behavior matches, run performance tests with looping (`TOTAL_LOOPING_TIME=10s`). Calculate speedup ratio.
5. **Validation**: Candidate must beat `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup) and pass stability checks.
### 4d. Refinement & Repair
- **Repair**: If fewer than `MIN_CORRECT_CANDIDATES=2` pass, failed candidates can be repaired via `AIServiceCodeRepairRequest` (sends test diffs to LLM).
- **Refinement**: Top valid candidates are refined via `AIServiceRefinerRequest` (sends runtime data, line profiler results).
- **Adaptive**: At HIGH effort, additional adaptive optimization rounds via `AIServiceAdaptiveOptimizeRequest`.
### 4e. Best Candidate Selection
The winning candidate is selected by:
1. Highest speedup ratio
2. For tied speedups, shortest diff length from original
3. Refinement candidates use weighted ranking: `(2 * runtime_rank + 1 * diff_rank)`
Result is a `BestOptimization` with the candidate, context, test results, and runtime.
## 5. PR Creation (`github/`)
If a winning candidate is found, a PR is created with:
- The optimized code diff
- Performance benchmark details
- Explanation from the LLM
## Worktree Mode
When `--worktree` is enabled, optimization runs in an isolated git worktree (`code_utils/git_worktree_utils.py`). This allows parallel optimization without affecting the working tree. Changes are captured as patch files.

View file

@ -1,93 +0,0 @@
# Verification
How codeflash verifies candidate correctness and measures performance.
## Test Execution Architecture
Tests are executed in a **subprocess** to isolate the test environment from the main codeflash process. The test runner (`verification/test_runner.py`) invokes pytest (or Jest for JS/TS) with specific plugin configurations.
### Plugin Blocklists
- **Behavioral tests**: Block `benchmark`, `codspeed`, `xdist`, `sugar`
- **Benchmarking tests**: Block `codspeed`, `cov`, `benchmark`, `profiling`, `xdist`, `sugar`
These are defined as `BEHAVIORAL_BLOCKLISTED_PLUGINS` and `BENCHMARKING_BLOCKLISTED_PLUGINS` in `verification/test_runner.py`.
## Custom Pytest Plugin (`verification/pytest_plugin.py`)
The plugin is loaded into the test subprocess and provides:
### Deterministic Patches
`_apply_deterministic_patches()` replaces non-deterministic functions with fixed values to ensure reproducible test output:
| Module | Function | Fixed Value |
|--------|----------|-------------|
| `time` | `time()` | `1761717605.108106` |
| `time` | `perf_counter()` | Incrementing by 1ms per call |
| `datetime` | `datetime.now()` | `2021-01-01 02:05:10 UTC` |
| `datetime` | `datetime.utcnow()` | `2021-01-01 02:05:10 UTC` |
| `uuid` | `uuid4()` / `uuid1()` | `12345678-1234-5678-9abc-123456789012` |
| `random` | `random()` | `0.123456789` (seeded with 42) |
| `os` | `urandom(n)` | `b"\x42" * n` |
| `numpy.random` | seed | `42` |
Patches call the original function first to maintain performance characteristics (same call overhead).
### Timing Markers
Test results include timing markers in stdout: `!######<id>:<duration_ns>######!`
The pattern `_TIMING_MARKER_PATTERN` extracts timing data for calculating function utilization fraction.
### Loop Stability
Performance benchmarking uses configurable stability thresholds:
- `STABILITY_WINDOW_SIZE = 0.35` (35% of total iterations)
- `STABILITY_CENTER_TOLERANCE = 0.0025` (±0.25% around median)
- `STABILITY_SPREAD_TOLERANCE = 0.0025` (0.25% window spread)
### Memory Limits (Linux)
On Linux, the plugin sets `RLIMIT_AS` to 85% of total system memory (RAM + swap) to prevent OOM kills.
## Test Result Processing
### `TestResults` (`models/models.py`)
Collects `FunctionTestInvocation` results with:
- Deduplicated insertion via `unique_invocation_loop_id`
- `total_passed_runtime()` — Sum of minimum runtimes per test case (nanoseconds)
- `number_of_loops()` — Max loop index
- `usable_runtime_data_by_test_case()` — Grouped timing data
### `FunctionTestInvocation`
Each invocation records:
- `loop_index` — Iteration number (starts at 1)
- `id: InvocationId` — Fully qualified test identifier
- `did_pass: bool` — Pass/fail status
- `runtime: Optional[int]` — Time in nanoseconds
- `return_value: Optional[object]` — Captured return value
- `test_type: TestType` — Which test category
### Behavioral vs Performance Testing
1. **Behavioral**: Runs with `TestingMode.BEHAVIOR`. Compares return values and stdout between original and candidate. Any difference = candidate rejected.
2. **Performance**: Runs with `TestingMode.PERFORMANCE`. Loops for `TOTAL_LOOPING_TIME=10s` to get stable timing. Calculates speedup ratio.
3. **Line Profile**: Runs with `TestingMode.LINE_PROFILE`. Collects per-line timing data for refinement.
## Test Types
| TestType | Value | Description |
|----------|-------|-------------|
| `EXISTING_UNIT_TEST` | 1 | Pre-existing tests from the codebase |
| `INSPIRED_REGRESSION` | 2 | Tests inspired by existing tests |
| `GENERATED_REGRESSION` | 3 | AI-generated regression tests |
| `REPLAY_TEST` | 4 | Tests from recorded benchmark data |
| `CONCOLIC_COVERAGE_TEST` | 5 | Coverage-guided tests |
| `INIT_STATE_TEST` | 6 | Class init state verification |
## Coverage
Coverage is measured via `CoverageData` with a threshold of `COVERAGE_THRESHOLD=60.0%`. Low coverage may affect confidence in the optimization's correctness.

View file

@ -1,118 +0,0 @@
{
"package_name": "codeflash-docs",
"total_capabilities": 16,
"capabilities": [
{
"id": 0,
"name": "pipeline-stage-ordering",
"description": "Know the correct ordering of codeflash pipeline stages: Discovery → Ranking → Context Extraction → Test Gen + Optimization (concurrent) → Baseline → Candidate Evaluation → PR",
"complexity": "basic",
"api_elements": ["Optimizer.run()", "FunctionOptimizer.optimize_function()"]
},
{
"id": 1,
"name": "function-to-optimize-fields",
"description": "Know FunctionToOptimize key fields (function_name, file_path, parents, starting_line/ending_line, is_async, is_method, language) and properties (qualified_name, top_level_parent_name, class_name)",
"complexity": "intermediate",
"api_elements": ["FunctionToOptimize", "FunctionParent", "models/function_types.py"]
},
{
"id": 2,
"name": "code-strings-markdown-format",
"description": "Know that code is serialized as markdown fenced blocks with language:filepath syntax (```python:filepath\\ncode\\n```) and parsed via CodeStringsMarkdown.parse_markdown_code()",
"complexity": "intermediate",
"api_elements": ["CodeStringsMarkdown", "CodeString", ".markdown", ".flat", "parse_markdown_code()"]
},
{
"id": 3,
"name": "read-writable-vs-read-only",
"description": "Distinguish read_writable_code (LLM can modify) from read_only_context_code (reference only) in CodeOptimizationContext",
"complexity": "basic",
"api_elements": ["CodeOptimizationContext", "read_writable_code", "read_only_context_code"]
},
{
"id": 4,
"name": "candidate-source-types",
"description": "Know OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE and when each is used",
"complexity": "intermediate",
"api_elements": ["OptimizedCandidateSource", "OptimizedCandidate"]
},
{
"id": 5,
"name": "candidate-forest-dag",
"description": "Know that candidates form a forest/DAG via parent_id references where refinements and repairs build on previous candidates",
"complexity": "intermediate",
"api_elements": ["parent_id", "OptimizedCandidate", "CandidateForest"]
},
{
"id": 6,
"name": "concurrent-testgen-optimization",
"description": "Know that test generation and LLM optimization run concurrently using concurrent.futures, not sequentially",
"complexity": "intermediate",
"api_elements": ["concurrent.futures", "FunctionOptimizer.optimize_function()"]
},
{
"id": 7,
"name": "deterministic-patch-values",
"description": "Know the specific fixed values used by deterministic patches: time=1761717605.108106, datetime=2021-01-01 02:05:10 UTC, uuid=12345678-1234-5678-9abc-123456789012, random seeded with 42",
"complexity": "advanced",
"api_elements": ["_apply_deterministic_patches()", "pytest_plugin.py"]
},
{
"id": 8,
"name": "test-type-enum",
"description": "Know the 6 TestType variants: EXISTING_UNIT_TEST, INSPIRED_REGRESSION, GENERATED_REGRESSION, REPLAY_TEST, CONCOLIC_COVERAGE_TEST, INIT_STATE_TEST",
"complexity": "basic",
"api_elements": ["TestType", "models/test_type.py"]
},
{
"id": 9,
"name": "ai-service-endpoints",
"description": "Know the AI service endpoints: /ai/optimize, /ai/optimize_line_profiler, /ai/refine, /ai/repair, /ai/adaptive_optimize, /ai/rewrite_jit",
"complexity": "intermediate",
"api_elements": ["AiServiceClient", "api/aiservice.py"]
},
{
"id": 10,
"name": "repair-request-structure",
"description": "Know that AIServiceCodeRepairRequest includes TestDiff objects with scope (RETURN_VALUE/STDOUT/DID_PASS), original vs candidate values, and test source code",
"complexity": "advanced",
"api_elements": ["AIServiceCodeRepairRequest", "TestDiff", "TestDiffScope"]
},
{
"id": 11,
"name": "effort-level-values",
"description": "Know specific effort level values: LOW gets 3 candidates, MEDIUM gets 5, HIGH gets 6 (N_OPTIMIZER_CANDIDATES)",
"complexity": "intermediate",
"api_elements": ["EffortLevel", "N_OPTIMIZER_CANDIDATES", "EFFORT_VALUES"]
},
{
"id": 12,
"name": "context-token-limits",
"description": "Know OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000 and TESTGEN_CONTEXT_TOKEN_LIMIT=16000 and that encoded_tokens_len() is used for counting",
"complexity": "basic",
"api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
},
{
"id": 13,
"name": "best-candidate-selection",
"description": "Know the selection criteria: highest speedup, then shortest diff for ties, and refinement weighted ranking (2*runtime + 1*diff)",
"complexity": "advanced",
"api_elements": ["BestOptimization", "REFINED_CANDIDATE_RANKING_WEIGHTS"]
},
{
"id": 14,
"name": "plugin-blocklists",
"description": "Know behavioral test blocklisted plugins (benchmark, codspeed, xdist, sugar) and benchmarking blocklist (adds cov, profiling)",
"complexity": "intermediate",
"api_elements": ["BEHAVIORAL_BLOCKLISTED_PLUGINS", "BENCHMARKING_BLOCKLISTED_PLUGINS"]
},
{
"id": 15,
"name": "result-type-usage",
"description": "Know that Result[L,R] from either.py uses Success(value)/Failure(error) with is_successful() check before unwrap()",
"complexity": "basic",
"api_elements": ["Result", "Success", "Failure", "is_successful", "either.py"]
}
]
}

View file

@ -1 +0,0 @@
Code serialization format and context splitting

View file

@ -1,21 +0,0 @@
{
"context": "Tests whether the agent knows the CodeStringsMarkdown serialization format and the distinction between read-writable and read-only code context in the codeflash pipeline.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Markdown code block format",
"description": "Uses the correct fenced code block format with language:filepath syntax (```python:path/to/file.py) when constructing code for the AI service, NOT plain code blocks without file paths",
"max_score": 30
},
{
"name": "Read-writable vs read-only split",
"description": "Correctly separates code into read_writable_code (code the LLM can modify) and read_only_context_code (reference-only dependency code), NOT treating all code as modifiable",
"max_score": 35
},
{
"name": "parse_markdown_code usage",
"description": "Uses CodeStringsMarkdown.parse_markdown_code() to parse AI service responses back into structured code, NOT manual string splitting or regex",
"max_score": 35
}
]
}

View file

@ -1,35 +0,0 @@
# Format Code for AI Service Request
## Context
You are working on the codeflash optimization engine. The AI service accepts optimization requests with source code and dependency context. A function `calculate_total` in `analytics/metrics.py` needs to be optimized. It calls a helper `normalize_values` in the same file (both modifiable), and imports `BaseMetric` from `analytics/base.py` (not modifiable, just for reference).
```python
# analytics/metrics.py
from analytics.base import BaseMetric
def normalize_values(data: list[float]) -> list[float]:
max_val = max(data)
return [x / max_val for x in data]
def calculate_total(metrics: list[BaseMetric]) -> float:
values = [m.value for m in metrics]
normalized = normalize_values(values)
return sum(normalized)
```
```python
# analytics/base.py
class BaseMetric:
def __init__(self, name: str, value: float):
self.name = name
self.value = value
```
## Task
Write a Python function `prepare_optimization_payload` that constructs the code payload for an AI service optimization request for `calculate_total`. It should properly format the source code and dependency code, and include a function to parse the AI service response back into structured code objects.
## Expected Outputs
- A Python file `payload_builder.py` with the payload construction and response parsing logic

View file

@ -1 +0,0 @@
Candidate source types and DAG relationships

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Lists source types",
"description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
"max_score": 25
},
{
"name": "Parent ID linkage",
"description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
"max_score": 25
},
{
"name": "Refinement uses runtime data",
"description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
"max_score": 25
},
{
"name": "Repair uses test diffs",
"description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
"max_score": 25
}
]
}

View file

@ -1,13 +0,0 @@
# Document the Candidate Lifecycle
## Context
A new engineer is joining the codeflash team and needs to understand how optimization candidates are generated, improved, and related to each other throughout the pipeline. They've asked for a clear explanation of the different ways candidates are produced and how the system iterates on them.
## Task
Write a technical document explaining the full lifecycle of an optimization candidate in codeflash — from initial generation through improvement iterations. Cover all the different ways candidates can be created, what data is sent to the AI service for each type, and how candidates relate to each other structurally.
## Expected Outputs
- A markdown file `candidate-lifecycle.md`

View file

@ -1 +0,0 @@
Deterministic patch values and test execution architecture

View file

@ -1,31 +0,0 @@
{
"context": "Tests whether the agent knows the specific deterministic patch values used in codeflash's pytest plugin and the subprocess-based test execution architecture.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Subprocess isolation",
"description": "States that tests run in a subprocess to isolate the test environment from the main codeflash process, NOT in the same process",
"max_score": 20
},
{
"name": "Fixed time value",
"description": "References the specific fixed timestamp 1761717605.108106 for time.time() or the fixed datetime 2021-01-01 02:05:10 UTC for datetime.now()",
"max_score": 20
},
{
"name": "Fixed UUID value",
"description": "References the specific fixed UUID 12345678-1234-5678-9abc-123456789012 for uuid4/uuid1",
"max_score": 20
},
{
"name": "Random seed",
"description": "States that random is seeded with 42 (NOT a different seed value)",
"max_score": 20
},
{
"name": "Plugin blocklists",
"description": "Mentions that behavioral tests block specific pytest plugins (at least 2 of: benchmark, codspeed, xdist, sugar) to ensure deterministic execution",
"max_score": 20
}
]
}

View file

@ -1,13 +0,0 @@
# Explain Test Reproducibility Guarantees
## Context
A codeflash user notices that their optimization candidate passes behavioral tests on one run but fails on the next. They suspect non-determinism in the test execution. They want to understand what guarantees codeflash provides for test reproducibility and how the system ensures consistent results.
## Task
Write a technical explanation of how codeflash ensures deterministic test execution. Cover the execution environment setup, what sources of non-determinism are controlled, and any specific values or configurations used. Also explain the test execution architecture.
## Expected Outputs
- A markdown file `test-reproducibility.md`

View file

@ -1 +0,0 @@
Effort level configuration and candidate selection criteria

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent knows the specific effort level values for candidate generation and the criteria used to select the best optimization candidate.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Candidate counts by effort",
"description": "States correct N_OPTIMIZER_CANDIDATES values: LOW=3, MEDIUM=5, HIGH=6 (at least 2 of 3 correct)",
"max_score": 25
},
{
"name": "Speedup as primary selector",
"description": "States that the winning candidate is selected primarily by highest speedup ratio",
"max_score": 25
},
{
"name": "Diff length as tiebreaker",
"description": "States that for tied speedups, shortest diff length from original is used as tiebreaker",
"max_score": 25
},
{
"name": "Refinement ranking weights",
"description": "States that refinement candidates use weighted ranking with runtime weighted more heavily than diff (2:1 ratio or REFINED_CANDIDATE_RANKING_WEIGHTS=(2,1))",
"max_score": 25
}
]
}

View file

@ -1,18 +0,0 @@
# Design a Candidate Selection Dashboard
## Context
The codeflash team wants to build a dashboard that shows users how optimization candidates were evaluated and why a particular candidate won. The dashboard needs to display the selection process at each stage, from initial candidate pool through to the final winner.
## Task
Write a specification document for the dashboard that explains:
1. How many candidates are generated at each effort level
2. The exact criteria and order of operations used to pick the winning candidate
3. How refinement candidates are ranked differently from initial candidates
Include concrete examples showing how two hypothetical candidates would be compared.
## Expected Outputs
- A markdown file `selection-dashboard-spec.md`

View file

@ -1 +0,0 @@
Pipeline concurrency and FunctionToOptimize structure

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent knows the FunctionToOptimize data structure and the concurrent execution model for test generation and optimization.",
"type": "weighted_checklist",
"checklist": [
{
"name": "FunctionToOptimize fields",
"description": "Includes at least 4 of: function_name, file_path, parents (list of FunctionParent), starting_line, ending_line, is_async, is_method, language",
"max_score": 25
},
{
"name": "Qualified name property",
"description": "Mentions qualified_name as a property that produces the full dotted name including parent classes (e.g., MyClass.my_method)",
"max_score": 25
},
{
"name": "Concurrent execution",
"description": "States that test generation and LLM optimization run concurrently (in parallel), NOT sequentially one after the other",
"max_score": 25
},
{
"name": "Entry point identification",
"description": "Correctly identifies Optimizer.run() as the top-level entry point and FunctionOptimizer.optimize_function() as the per-function entry point",
"max_score": 25
}
]
}

View file

@ -1,17 +0,0 @@
# Implement a Function Optimization Status Tracker
## Context
The codeflash team needs a status tracker that logs what happens to each function during an optimization run. For each function, it should record the function identity, which pipeline stages it passed through, and how long each stage took.
## Task
Write a design document explaining:
1. What data structure represents a function being optimized, including its identity fields and how nested functions (methods inside classes) are represented
2. The full name resolution strategy for identifying functions uniquely
3. Which stages of the pipeline operate on a single function at a time vs. operating on multiple functions
4. Where in the codebase the per-function optimization is orchestrated and what the top-level entry point is
## Expected Outputs
- A markdown file `status-tracker-design.md`

View file

@ -1,40 +0,0 @@
{
"total_scenarios": 5,
"capabilities_coverage": {
"total_capabilities": 16,
"capabilities_tested": 12,
"coverage_percentage": 75.0
},
"complexity_distribution": {
"basic": 1,
"intermediate": 3,
"advanced": 1
},
"scenarios": [
{
"index": 1,
"capability": "code-strings-markdown-format, read-writable-vs-read-only",
"complexity": "intermediate"
},
{
"index": 2,
"capability": "candidate-source-types, candidate-forest-dag, repair-request-structure",
"complexity": "intermediate"
},
{
"index": 3,
"capability": "deterministic-patch-values, plugin-blocklists",
"complexity": "advanced"
},
{
"index": 4,
"capability": "effort-level-values, best-candidate-selection",
"complexity": "intermediate"
},
{
"index": 5,
"capability": "function-to-optimize-fields, concurrent-testgen-optimization, pipeline-stage-ordering",
"complexity": "basic"
}
]
}

View file

@ -1,25 +0,0 @@
{
"total_infeasible": 4,
"infeasible_capabilities": [
{
"capability": "ai-service-endpoints",
"complexity": "intermediate",
"reasoning": "Testing knowledge of specific API endpoints requires actual HTTP requests or mocking that bypasses the capability being tested"
},
{
"capability": "context-token-limits",
"complexity": "basic",
"reasoning": "Already covered by the skills tile eval (scenario-1). Testing token counting requires the actual tokenizer library"
},
{
"capability": "test-type-enum",
"complexity": "basic",
"reasoning": "Simple enum knowledge is better verified through skills that use test types rather than isolated recall"
},
{
"capability": "result-type-usage",
"complexity": "basic",
"reasoning": "Already covered by the skills tile eval (scenario-2). Testing Result type usage is better done through implementation tasks"
}
]
}

View file

@ -1,7 +0,0 @@
{
"name": "codeflash/codeflash-docs",
"version": "0.1.0",
"summary": "Internal documentation for the codeflash optimization engine",
"private": true,
"docs": "docs/index.md"
}

View file

@ -1,45 +0,0 @@
# Architecture
```
codeflash/
├── main.py # CLI entry point
├── cli_cmds/ # Command handling, console output (Rich)
├── discovery/ # Find optimizable functions
├── context/ # Extract code dependencies and imports
├── optimization/ # Generate optimized code via AI
│ ├── optimizer.py # Main optimization orchestration
│ └── function_optimizer.py # Per-function optimization logic
├── verification/ # Run deterministic tests (pytest plugin)
├── benchmarking/ # Performance measurement
├── github/ # PR creation
├── api/ # AI service communication
├── code_utils/ # Code parsing, git utilities
├── models/ # Pydantic models and types
├── languages/ # Multi-language support (Python, JavaScript/TypeScript)
├── setup/ # Config schema, auto-detection, first-run experience
├── picklepatch/ # Serialization/deserialization utilities
├── tracing/ # Function call tracing
├── tracer.py # Root-level tracer entry point for profiling
├── lsp/ # IDE integration (Language Server Protocol)
├── telemetry/ # Sentry, PostHog
├── either.py # Functional Result type for error handling
├── result/ # Result types and handling
└── version.py # Version information
```
## Key Entry Points
| Task | Start here |
|------|------------|
| CLI arguments & commands | `cli_cmds/cli.py` |
| Optimization orchestration | `optimization/optimizer.py` → `Optimizer.run()` |
| Per-function optimization | `optimization/function_optimizer.py` → `FunctionOptimizer` |
| Function discovery | `discovery/functions_to_optimize.py` |
| Context extraction | `context/code_context_extractor.py` |
| Test execution | `verification/test_runner.py`, `verification/pytest_plugin.py` |
| Performance ranking | `benchmarking/function_ranker.py` |
| Domain types | `models/models.py`, `models/function_types.py` |
| Result handling | `either.py` (`Result`, `Success`, `Failure`, `is_successful`) |
| AI service communication | `api/aiservice.py` → `AiServiceClient` |
| Configuration constants | `code_utils/config_consts.py` |
| Language support | `languages/registry.py` → `get_language_support()` |

View file

@ -1,11 +0,0 @@
# Code Style
- **Line length**: 120 characters
- **Python**: 3.9+ syntax (use `from __future__ import annotations` for type hints)
- **Package management**: Always use `uv`, never `pip` — run commands via `uv run`
- **Tooling**: Ruff for linting/formatting, mypy strict mode, prek for pre-commit checks (`uv run prek run`)
- **Comments**: Minimal — only explain "why", not "what"
- **Docstrings**: Do not add unless explicitly requested
- **Naming**: NEVER use leading underscores (`_function_name`) — Python has no true private functions, use public names
- **Paths**: Always use absolute `Path` objects, handle encoding explicitly (UTF-8)
- **Source transforms**: Use `libcst` for code modification/transformation to preserve formatting; `ast` is acceptable for read-only analysis and parsing

View file

@ -1,9 +0,0 @@
# Git Conventions
- **Always create a new branch from `main`** — never commit directly to `main` or reuse an existing feature branch for unrelated changes
- Use conventional commit format: `fix:`, `feat:`, `refactor:`, `docs:`, `test:`, `chore:`
- Keep commits atomic — one logical change per commit
- Commit message body should be concise (1-2 sentences max)
- PR titles should also use conventional format
- Branch naming: `cf-#-title` (lowercase, hyphenated) where `#` is the Linear issue number
- If related to a Linear issue, include `CF-#` in the PR body

View file

@ -1,9 +0,0 @@
# Language Support Rules
- Current language is a module-level singleton in `languages/current.py` — use `set_current_language()` / `current_language()`, never pass language as a parameter through call chains
- Use `get_language_support(identifier)` from `languages/registry.py` to get a `LanguageSupport` instance — accepts `Path`, `Language` enum, or string; never import language classes directly
- New language support classes must use the `@register_language` decorator to register with the extension and language registries
- `languages/__init__.py` uses `__getattr__` for lazy imports to avoid circular dependencies — follow this pattern when adding new exports
- `is_javascript()` returns `True` for both JavaScript and TypeScript
- Language modules are lazily imported on first `get_language_support()` call via `_ensure_languages_registered()` — the `@register_language` decorator fires on import and populates `_EXTENSION_REGISTRY` and `_LANGUAGE_REGISTRY`
- `LanguageSupport` instances are cached in `_SUPPORT_CACHE` — use `clear_cache()` only in tests

View file

@ -1,11 +0,0 @@
# Optimization Pipeline Patterns
- All major operations return `Result[SuccessType, ErrorType]` — construct with `Success(value)` / `Failure(error)`, check with `is_successful()` before calling `unwrap()`
- Code context has token limits (`OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000`, `TESTGEN_CONTEXT_TOKEN_LIMIT=16000` in `code_utils/config_consts.py`) — exceeding them rejects the function
- `read_writable_code` (modifiable code) can span multiple files; `read_only_context_code` is reference-only dependency code
- Code is serialized as markdown code blocks: `` ```language:filepath\ncode\n``` `` — see `CodeStringsMarkdown` in `models/models.py`
- Candidates form a forest (DAG): refinements/repairs point back to the candidate they build on through `parent_id`, and each candidate's origin is tagged with an `OptimizedCandidateSource` (OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE)
- Test generation and optimization run concurrently — coordinate through `CandidateEvaluationContext`
- Generated tests are instrumented with `codeflash_capture.py` to record return values and traces
- Minimum improvement threshold is 5% (`MIN_IMPROVEMENT_THRESHOLD=0.05`) — candidates below this are rejected
- Stability thresholds: `STABILITY_WINDOW_SIZE=0.35`, `STABILITY_CENTER_TOLERANCE=0.0025`, `STABILITY_SPREAD_TOLERANCE=0.0025`

View file

@ -1,13 +0,0 @@
# Testing Rules
- Code context extraction and replacement tests must assert full string equality — no substring matching
- Use pytest's `tmp_path` fixture for temp directories (it's a `Path` object)
- Write temp files inside `tmp_path`, never use `NamedTemporaryFile` (causes Windows file contention)
- Always call `.resolve()` on Path objects to ensure absolute paths and resolve symlinks
- Use `.as_posix()` when converting resolved paths to strings (normalizes to forward slashes)
- Any new feature or bug fix that can be tested automatically must have test cases
- If changes affect existing test expectations, update the tests accordingly — tests must always pass after changes
- The pytest plugin patches `time`, `random`, `uuid`, `datetime`, `os.urandom`, and `numpy.random` for deterministic test execution — never assume real randomness or real time in verification tests
- `conftest.py` uses an autouse fixture that calls `reset_current_language()` — tests always start with Python as the default language
- Test types are defined by the `TestType` enum: `EXISTING_UNIT_TEST`, `INSPIRED_REGRESSION`, `GENERATED_REGRESSION`, `REPLAY_TEST`, `CONCOLIC_COVERAGE_TEST`, `INIT_STATE_TEST`
- Verification runs tests in a subprocess using a custom pytest plugin (`verification/pytest_plugin.py`) — behavioral tests use blocklisted plugins (`benchmark`, `codspeed`, `xdist`, `sugar`), benchmarking tests additionally block `cov` and `profiling`

View file

@ -1,26 +0,0 @@
{
"name": "codeflash/codeflash-rules",
"version": "0.1.0",
"summary": "Coding standards and conventions for the codeflash codebase",
"private": true,
"rules": {
"code-style": {
"rules": "rules/code-style.md"
},
"architecture": {
"rules": "rules/architecture.md"
},
"optimization-patterns": {
"rules": "rules/optimization-patterns.md"
},
"git-conventions": {
"rules": "rules/git-conventions.md"
},
"testing-rules": {
"rules": "rules/testing-rules.md"
},
"language-rules": {
"rules": "rules/language-rules.md"
}
}
}

View file

@ -1,104 +0,0 @@
{
"package_name": "codeflash-skills",
"total_capabilities": 14,
"capabilities": [
{
"id": 0,
"name": "sequential-pipeline-debugging",
"description": "Debug optimization failures by walking through pipeline stages sequentially and stopping at the first failure found",
"complexity": "intermediate",
"api_elements": ["discovery", "ranking", "context", "AI service", "verification", "deduplication", "repair"]
},
{
"id": 1,
"name": "token-limit-awareness",
"description": "Know that OPTIMIZATION_CONTEXT_TOKEN_LIMIT and TESTGEN_CONTEXT_TOKEN_LIMIT are both 16000 tokens and that exceeding them causes function rejection",
"complexity": "basic",
"api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
},
{
"id": 2,
"name": "improvement-threshold",
"description": "Know that MIN_IMPROVEMENT_THRESHOLD is 0.05 (5%) and candidates below this speedup are rejected",
"complexity": "basic",
"api_elements": ["MIN_IMPROVEMENT_THRESHOLD", "STABILITY_WINDOW_SIZE"]
},
{
"id": 3,
"name": "ast-deduplication",
"description": "Know that candidates are deduplicated via AST normalization using normalize_code() and CandidateEvaluationContext.ast_code_to_id",
"complexity": "intermediate",
"api_elements": ["normalize_code()", "CandidateEvaluationContext.ast_code_to_id", "code_utils/deduplicate_code.py"]
},
{
"id": 4,
"name": "repair-trigger-conditions",
"description": "Know that repair only triggers when fewer than MIN_CORRECT_CANDIDATES=2 pass, and is skipped when REPAIR_UNMATCHED_PERCENTAGE_LIMIT is exceeded",
"complexity": "advanced",
"api_elements": ["MIN_CORRECT_CANDIDATES", "REPAIR_UNMATCHED_PERCENTAGE_LIMIT", "AIServiceCodeRepairRequest"]
},
{
"id": 5,
"name": "ai-service-error-patterns",
"description": "Know specific log patterns to search for when AI service fails: 'Error generating optimized candidates', 'cli-optimize-error-caught', 'cli-optimize-error-response'",
"complexity": "intermediate",
"api_elements": ["AiServiceClient", "api/aiservice.py"]
},
{
"id": 6,
"name": "behavioral-vs-benchmark-failures",
"description": "Distinguish between behavioral test failures (return value/stdout/pass-fail mismatches via TestDiffScope) and benchmark failures (speedup below threshold)",
"complexity": "intermediate",
"api_elements": ["TestDiffScope", "RETURN_VALUE", "STDOUT", "DID_PASS"]
},
{
"id": 7,
"name": "result-type-pattern",
"description": "Use Result[L, R] from either.py with Success/Failure constructors and is_successful() checks before unwrap()",
"complexity": "basic",
"api_elements": ["Result", "Success", "Failure", "is_successful", "unwrap()", "either.py"]
},
{
"id": 8,
"name": "effort-config-pattern",
"description": "Add effort-dependent config via EffortKeys enum, EFFORT_VALUES dict with LOW/MEDIUM/HIGH levels, and get_effort_value()",
"complexity": "intermediate",
"api_elements": ["EffortKeys", "EffortLevel", "EFFORT_VALUES", "get_effort_value()", "config_consts.py"]
},
{
"id": 9,
"name": "module-to-feature-mapping",
"description": "Know which codeflash module to modify for different feature types (optimization/ for strategies, api/ for endpoints, languages/ for language support, etc.)",
"complexity": "basic",
"api_elements": ["MODULE_REFERENCE.md"]
},
{
"id": 10,
"name": "domain-type-conventions",
"description": "Use @dataclass(frozen=True) for immutable data, BaseModel for serializable models, and keep function_types.py dependency-free",
"complexity": "intermediate",
"api_elements": ["@dataclass(frozen=True)", "BaseModel", "models/models.py", "models/function_types.py"]
},
{
"id": 11,
"name": "test-patterns",
"description": "Use tmp_path fixture, .resolve() on Paths, .as_posix() for string conversion, full string equality assertions, and awareness of deterministic patches",
"complexity": "basic",
"api_elements": ["tmp_path", ".resolve()", ".as_posix()", "pytest_plugin.py"]
},
{
"id": 12,
"name": "quality-check-commands",
"description": "Run uv run prek run for formatting/linting, uv run mypy for type checking, and uv run pytest for tests",
"complexity": "basic",
"api_elements": ["uv run prek run", "uv run mypy", "uv run pytest"]
},
{
"id": 13,
"name": "language-support-patterns",
"description": "Use @register_language decorator, get_language_support() for lookup, singleton pattern via set_current_language()/current_language(), and is_python()/is_javascript() guards",
"complexity": "advanced",
"api_elements": ["@register_language", "get_language_support()", "set_current_language()", "is_python()", "is_javascript()"]
}
]
}

View file

@ -1 +0,0 @@
Sequential pipeline debugging with specific thresholds

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent follows the sequential debugging workflow from the skill, checking pipeline stages in order and using correct threshold values when diagnosing an optimization that produced no results.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Sequential stage order",
"description": "Investigates pipeline stages in order: discovery before ranking before context before AI service before test failures. Does NOT jump to later stages without checking earlier ones first.",
"max_score": 25
},
{
"name": "Token limit value",
"description": "References the specific token limit of 16000 for OPTIMIZATION_CONTEXT_TOKEN_LIMIT or TESTGEN_CONTEXT_TOKEN_LIMIT when checking context extraction",
"max_score": 25
},
{
"name": "Importance threshold",
"description": "References DEFAULT_IMPORTANCE_THRESHOLD=0.001 when checking function ranking",
"max_score": 25
},
{
"name": "Stops at failure",
"description": "Identifies the failing stage and focuses investigation there rather than continuing through all remaining stages",
"max_score": 25
}
]
}

View file

@ -1,13 +0,0 @@
# Diagnose Silent Optimization Skip
## Context
A user reports that when running codeflash on their project, a specific function `calculate_metrics` in `analytics/processor.py` never appears in the optimization results. The function exists in the module root, is not in the exclude list, and has not been previously optimized. Trace data shows the function is called frequently but with very short execution times (averaging 0.0005 seconds total addressable time). The function has moderate dependencies.
## Task
Write a diagnostic report explaining why this function is being skipped and at which stage in the pipeline the function is filtered out. Include the specific threshold or condition that causes the skip.
## Expected Outputs
- A markdown file `diagnostic-report.md` explaining the root cause

View file

@ -1 +0,0 @@
Result type pattern and effort-dependent configuration

View file

@ -1,31 +0,0 @@
{
"context": "Tests whether the agent uses the codeflash Result type pattern from either.py and the effort-dependent configuration pattern when implementing a new pipeline feature.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Imports from either.py",
"description": "Imports Success, Failure, and is_successful from codeflash.either (NOT from a different error handling module)",
"max_score": 20
},
{
"name": "Result return type",
"description": "Function returns Result type using Success() for success and Failure() for errors, not exceptions or None",
"max_score": 20
},
{
"name": "is_successful check",
"description": "Calls is_successful() or .is_successful() before calling unwrap() on the result",
"max_score": 20
},
{
"name": "EffortKeys enum entry",
"description": "Adds a new entry to the EffortKeys enum in config_consts.py",
"max_score": 20
},
{
"name": "Three effort levels",
"description": "Adds values for all three EffortLevel variants (LOW, MEDIUM, HIGH) in EFFORT_VALUES dict",
"max_score": 20
}
]
}

View file

@ -1,21 +0,0 @@
# Add Candidate Timeout Feature
## Context
The codeflash optimization engine currently has no per-candidate timeout. Some candidates take too long during verification, wasting the optimization budget. A new feature is needed to skip candidates that exceed a configurable time limit during behavioral testing.
The timeout should vary based on the optimization effort setting — shorter timeouts for low effort runs (to save time) and longer for high effort runs (to allow more complex optimizations).
## Task
Implement a `check_candidate_timeout` function in `codeflash/optimization/function_optimizer.py` that:
1. Takes a candidate runtime and returns whether the candidate should be skipped
2. Uses a configurable timeout threshold that scales with optimization effort
3. Handles the error case where the runtime measurement is unavailable
Also add the necessary configuration constant to `codeflash/code_utils/config_consts.py`.
## Expected Outputs
- Modified `function_optimizer.py` with the new function
- Modified `config_consts.py` with the new configuration

View file

@ -1 +0,0 @@
Test patterns and deterministic patch awareness

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent follows codeflash test conventions when writing tests, including path handling, temp directory patterns, and awareness of the deterministic patching system.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Uses tmp_path fixture",
"description": "Test function uses pytest tmp_path fixture parameter, NOT tempfile.NamedTemporaryFile or tempfile.mkdtemp",
"max_score": 25
},
{
"name": "Calls resolve on paths",
"description": "Calls .resolve() on Path objects before using them in assertions or function calls",
"max_score": 25
},
{
"name": "Full string equality",
"description": "Uses exact equality assertions (== or assert_equal) for code string comparisons, NOT substring checks like 'in' or assertIn or contains",
"max_score": 25
},
{
"name": "No real time dependency",
"description": "Test does NOT depend on real time.time(), datetime.now(), random values, or uuid generation for correctness. Acknowledges or accounts for deterministic patches if time/random values are involved.",
"max_score": 25
}
]
}

View file

@ -1,24 +0,0 @@
# Write Tests for Context Hash Comparison
## Context
The codeflash context extraction module has a function `compare_context_hashes(context_a, context_b)` that takes two `CodeOptimizationContext` objects and returns whether their hashing contexts are identical. This is used to detect when the same function has already been optimized.
```python
# In codeflash/context/code_context_extractor.py
def compare_context_hashes(context_a: CodeOptimizationContext, context_b: CodeOptimizationContext) -> bool:
    return context_a.hashing_code_context_hash == context_b.hashing_code_context_hash
```
## Task
Write a test file `tests/test_context/test_hash_comparison.py` with tests for this function. Include tests for:
1. Two contexts with identical code producing the same hash
2. Two contexts with different code producing different hashes
3. A context compared with itself
The tests should create temporary Python source files to build realistic context objects.
## Expected Outputs
- `tests/test_context/test_hash_comparison.py`

View file

@ -1 +0,0 @@
Domain type conventions and module identification

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent follows codeflash domain type conventions and correctly identifies the right module when adding a new data type for the optimization pipeline.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Placed in models/models.py",
"description": "New data type is added to codeflash/models/models.py (NOT models/function_types.py, since it has dependencies on other codeflash modules)",
"max_score": 25
},
{
"name": "Uses frozen dataclass",
"description": "Immutable data type uses @dataclass(frozen=True) decorator, NOT a regular class or unfrozen dataclass",
"max_score": 25
},
{
"name": "BaseModel for serializable",
"description": "If a serializable model is needed, uses Pydantic BaseModel (NOT dataclass or dict)",
"max_score": 25
},
{
"name": "Correct module for feature",
"description": "Places the main logic in the correct module for the feature type (e.g., verification/ for test-related, optimization/ for candidate-related, api/ for service-related)",
"max_score": 25
}
]
}

View file

@ -1,21 +0,0 @@
# Add Optimization Confidence Score
## Context
The codeflash team wants to add a confidence score to each optimization result. The score should capture how confident the system is that an optimization is both correct and beneficial. It combines test coverage percentage, number of passing test cases, and speedup stability into a single metric.
The score needs to be:
- Attached to each candidate during evaluation (immutable once computed)
- Included in the final PR report (needs JSON serialization)
- Computed during the candidate evaluation phase
## Task
1. Define the data types needed for the confidence score
2. Write a `compute_confidence_score` function that takes coverage percentage (float), passing test count (int), and stability ratio (float) and returns the confidence result
3. Place all code in the appropriate codeflash modules
## Expected Outputs
- New/modified type definitions in the appropriate models file
- New function in the appropriate module

View file

@ -1 +0,0 @@
Deduplication mechanics and repair trigger conditions

View file

@ -1,26 +0,0 @@
{
"context": "Tests whether the agent understands codeflash's candidate deduplication via AST normalization and the specific conditions under which code repair is triggered vs skipped.",
"type": "weighted_checklist",
"checklist": [
{
"name": "AST normalization",
"description": "Mentions that deduplication uses AST normalization (normalize_code from code_utils/deduplicate_code.py), NOT simple string comparison",
"max_score": 25
},
{
"name": "Duplicate result copying",
"description": "Explains that duplicate candidates copy results from the first-seen candidate rather than being re-tested",
"max_score": 25
},
{
"name": "Repair trigger threshold",
"description": "States that repair triggers when fewer than 2 candidates pass (MIN_CORRECT_CANDIDATES=2), NOT when zero candidates pass or when any candidate fails",
"max_score": 25
},
{
"name": "Unmatched percentage limit",
"description": "Mentions REPAIR_UNMATCHED_PERCENTAGE_LIMIT as a condition that can cause repair to be skipped entirely, with effort-dependent values (0.2/0.3/0.4)",
"max_score": 25
}
]
}

View file

@ -1,17 +0,0 @@
# Investigate Low Candidate Diversity
## Context
A codeflash user is optimizing a data processing function at medium effort level. The AI service returns 5 candidates, but the optimization log shows only 1 candidate was actually benchmarked. Of the 5 candidates, 1 passed behavioral tests but didn't meet the performance threshold. The user wants to understand what happened to the other 4 candidates and why no repair attempts were made.
## Task
Write an analysis document explaining:
1. Why only 1 out of 5 candidates was benchmarked
2. How the system determines which candidates to actually test
3. Under what conditions the system would have attempted to repair the failing candidates
4. What the user could change to get more diverse results
## Expected Outputs
A markdown file `analysis.md` with the explanation.

View file

@ -1,40 +0,0 @@
{
"total_scenarios": 5,
"capabilities_coverage": {
"total_capabilities": 14,
"capabilities_tested": 10,
"coverage_percentage": 71.4
},
"complexity_distribution": {
"basic": 2,
"intermediate": 2,
"advanced": 1
},
"scenarios": [
{
"index": 1,
"capability": "sequential-pipeline-debugging, token-limit-awareness, improvement-threshold",
"complexity": "intermediate"
},
{
"index": 2,
"capability": "result-type-pattern, effort-config-pattern",
"complexity": "intermediate"
},
{
"index": 3,
"capability": "test-patterns, quality-check-commands",
"complexity": "basic"
},
{
"index": 4,
"capability": "domain-type-conventions, module-to-feature-mapping",
"complexity": "basic"
},
{
"index": 5,
"capability": "ast-deduplication, repair-trigger-conditions",
"complexity": "advanced"
}
]
}

View file

@ -1,25 +0,0 @@
{
"total_infeasible": 4,
"infeasible_capabilities": [
{
"capability": "ai-service-error-patterns",
"complexity": "intermediate",
"reasoning": "Requires actual AI service API responses and log output that cannot be meaningfully mocked without bypassing the capability being tested"
},
{
"capability": "behavioral-vs-benchmark-failures",
"complexity": "intermediate",
"reasoning": "Requires actual test execution results with JUnit XML output and timing data that cannot be generated in a one-shot file-based eval"
},
{
"capability": "language-support-patterns",
"complexity": "advanced",
"reasoning": "Requires the full language registry system with imports and decorators that would need the codeflash runtime to verify"
},
{
"capability": "quality-check-commands",
"complexity": "basic",
"reasoning": "Requires running actual uv/prek/mypy commands which need the project environment and dependencies installed"
}
]
}

View file

@ -1,13 +0,0 @@
# Module Reference
| Feature area | Primary module | Key files |
|-------------|----------------|-----------|
| New optimization strategy | `optimization/` | `function_optimizer.py`, `optimizer.py` |
| New test type | `verification/`, `models/` | `test_runner.py`, `pytest_plugin.py`, `test_type.py` |
| New AI service endpoint | `api/` | `aiservice.py` |
| New language support | `languages/` | Create new `languages/<lang>/support.py` |
| Context extraction change | `context/` | `code_context_extractor.py` |
| New CLI command | `cli_cmds/` | `cli.py` |
| New config option | `setup/`, `code_utils/` | `config_consts.py`, `setup/detector.py` |
| Discovery filter | `discovery/` | `functions_to_optimize.py` |
| PR/result changes | `github/`, `result/` | Relevant handlers |

View file

@ -1,146 +0,0 @@
---
name: add-codeflash-feature
description: >
  Guides implementation of new functionality in the codeflash optimization engine.
  Use when adding a feature, building new functionality, implementing a new
  optimization strategy, adding a language backend, creating an API endpoint,
  extending the verification pipeline, or developing any new codeflash capability.
  Covers module identification, Result type patterns, config, types, tests, and
  quality checks.
---
# Add Codeflash Feature
Use this workflow when implementing new functionality in the codeflash codebase — new optimization strategies, language backends, API endpoints, CLI commands, config options, or pipeline extensions.
## Step 1: Identify Target Modules
Determine which module(s) need modification. See [MODULE_REFERENCE.md](MODULE_REFERENCE.md) for the full mapping of feature areas to modules and key files.
**Checkpoint**: Read the target files and understand existing patterns before writing any code. Look for similar features already implemented as reference.
## Step 2: Follow Result Type Pattern
Use the `Result[L, R]` type from `either.py` for error handling in pipeline operations:
```python
from codeflash.either import Result, Success, Failure, is_successful

def my_operation() -> Result[str, MyResultType]:
    if error_condition:
        return Failure("descriptive error message")
    return Success(result_value)

# Usage:
result = my_operation()
if not is_successful(result):
    logger.error(result.failure())
    return
value = result.unwrap()
```
**Checkpoint**: Verify your function signatures match the `Result` pattern used in surrounding code. Not all functions use `Result` — match the convention of the module you're modifying.
## Step 3: Add Configuration Constants
If the feature needs configurable thresholds or limits:
1. Add constants to `code_utils/config_consts.py`
2. If effort-dependent, add to `EFFORT_VALUES` dict with values for all three levels:
```python
# In config_consts.py:
class EffortKeys(str, Enum):
    MY_NEW_KEY = "MY_NEW_KEY"

EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = {
    # ... existing entries ...
    EffortKeys.MY_NEW_KEY.value: {
        EffortLevel.LOW: 1,
        EffortLevel.MEDIUM: 3,
        EffortLevel.HIGH: 5,
    },
}
```
3. Access via `get_effort_value(EffortKeys.MY_NEW_KEY, effort_level)`
**Checkpoint**: Skip this step if the feature doesn't need configuration. Not every feature requires new constants.
## Step 4: Add Domain Types
If new data structures are needed:
1. Add Pydantic models or frozen dataclasses to `models/models.py` or `models/function_types.py`
2. Use `@dataclass(frozen=True)` for immutable data, `BaseModel` for models that need serialization
3. Keep `function_types.py` dependency-free — no imports from other codeflash modules
Example following existing patterns:
```python
# In models/models.py:
@dataclass(frozen=True)
class MyNewType:
    name: str
    value: int
    source: OptimizedCandidateSource

# For serializable models:
class MyNewModel(BaseModel):
    items: list[MyNewType] = []
```
**Checkpoint**: Skip this step if you can reuse existing types. Check `models/models.py` for types that already fit your needs.
## Step 5: Write Tests
Follow existing test patterns:
1. Create test files in `tests/` mirroring the source structure (e.g., `tests/test_optimization/test_my_feature.py`)
2. Use pytest's `tmp_path` fixture for temp directories — never `NamedTemporaryFile`
3. Always call `.resolve()` on Path objects, and use `.as_posix()` when converting them to strings
4. Assert full string equality for code context tests — no substring matching
5. The pytest plugin patches `time`, `random`, `uuid`, `datetime` — never rely on real values in verification tests
```python
def test_my_feature(tmp_path: Path) -> None:
    test_file = tmp_path / "test_module.py"
    test_file.write_text("def foo(): return 1", encoding="utf-8")
    result = my_operation(test_file.resolve())
    assert is_successful(result)
    assert result.unwrap() == expected_value
```
**Checkpoint**: Run the new tests in isolation before proceeding: `uv run pytest tests/path/to/test_file.py -x`
## Step 6: Run Quality Checks
Run all validation before committing:
```bash
# Pre-commit checks (ruff format + lint)
uv run prek run
# Type checking
uv run mypy codeflash/
# Run relevant tests
uv run pytest tests/path/to/relevant/tests -x
```
**If checks fail**:
- `prek run` failures: Fix formatting/lint issues reported by ruff, then re-run
- `mypy` failures: Fix type errors — common issues are missing return types, wrong `Optional` usage, or missing imports in `TYPE_CHECKING` block
- Test failures: Fix the failing test or the implementation, then re-run
## Step 7: Language Support Considerations
If the feature needs to work across languages:
1. Use `get_language_support(identifier)` from `languages/registry.py` — never import language classes directly
2. Current language is a singleton: `set_current_language()` / `current_language()` from `languages/current.py`
3. Use `is_python()` / `is_javascript()` guards for language-specific branches
4. New language support classes must use `@register_language` decorator and be instantiable without arguments
**Checkpoint**: Skip this step if the feature is Python-only. Most features don't need multi-language support.
## Troubleshooting
If you run into issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common problems and fixes (circular imports, `UnsupportedLanguageError`, CI path failures, Pydantic validation errors, token limit exceeded).

View file

@ -1,9 +0,0 @@
# Troubleshooting
| Problem | Likely cause | Fix |
|---------|-------------|-----|
| Circular import at startup | Importing from `models/` in a module loaded early | Move import into `TYPE_CHECKING` block or use lazy import |
| `UnsupportedLanguageError` | Language modules not registered yet | Call `_ensure_languages_registered()` or use `get_language_support()` which does it automatically |
| Tests pass locally but fail in CI | Path differences (absolute vs relative) | Always use `.resolve()` on Path objects |
| `ValidationError` from Pydantic | Invalid code passed to `CodeString` | Check that generated code passes syntax validation for the target language |
| `encoded_tokens_len` exceeds limit | Context too large | Reduce helper functions or split into read-only vs read-writable |

View file

@ -1,124 +0,0 @@
---
name: debug-optimization-failure
description: >
  Diagnose why a codeflash optimization produced no results or failed silently.
  Use when an optimization run errors out, returns no candidates, or all candidates
  are rejected. Walks through discovery, ranking, context limits, AI service,
  test verification, deduplication, and repair stages.
---
# Debug Optimization Failure
Use this workflow when an optimization run fails or produces no results. Work through the stages sequentially — stop at the first failure found.
## Step 1: Check Function Discovery
Determine if the function was discovered by `FunctionVisitor`.
1. Search logs for the function name in discovery output:
```python
# In discovery/functions_to_optimize.py, FunctionVisitor filters out:
# - Functions matching exclude patterns in pyproject.toml [tool.codeflash]
# - Functions already optimized (was_function_previously_optimized())
# - Functions outside the configured module-root
```
2. Verify the function file is under the configured `module-root` in `pyproject.toml`
3. Check if the function was previously optimized — look for it in the optimization history
**Checkpoint**: If the function doesn't appear in discovery output, fix config patterns or file location before proceeding.
## Step 2: Check Ranking
If trace data is used, check if the function was ranked high enough.
1. Look at `benchmarking/function_ranker.py` output for the function's addressable time
2. The function must exceed `DEFAULT_IMPORTANCE_THRESHOLD=0.001`:
```python
# Addressable time = own time + (callee time / call count)
# Grep for the function in ranking output:
# grep -i "function_name" in ranking logs
```
3. Functions below the threshold are silently skipped
**Checkpoint**: If ranked too low, the function doesn't spend enough time to be worth optimizing. No fix needed — this is expected.
## Step 3: Check Context Token Limits
Verify the function's context fits within token limits.
1. Check thresholds in `code_utils/config_consts.py`:
```python
OPTIMIZATION_CONTEXT_TOKEN_LIMIT = 16000 # tokens
TESTGEN_CONTEXT_TOKEN_LIMIT = 16000 # tokens
```
2. Token counting uses `encoded_tokens_len()` from `code_utils/code_utils.py`
3. Common causes: large helper function chains, deep dependency trees, large class hierarchies
**Checkpoint**: If context exceeds limits, the function is rejected. Consider refactoring to reduce dependencies or splitting large modules.
## Step 4: Check AI Service Response
Verify the AI service returned valid candidates.
1. Look for HTTP errors in logs:
```
# Error patterns to search for:
"Error generating optimized candidates"
"Error generating jit rewritten candidate"
"cli-optimize-error-caught"
"cli-optimize-error-response"
```
2. Check `_get_valid_candidates()` in `api/aiservice.py` — empty `code_strings` after `CodeStringsMarkdown.parse_markdown_code()` means the LLM returned malformed code blocks
3. Verify API key is valid (`get_codeflash_api_key()`)
**Checkpoint**: If no candidates returned, check API key, network, and service status before proceeding.
## Step 5: Check Test Failures
Determine if candidates failed behavioral or benchmark tests.
1. **Behavioral failures** — compare return values, stdout, pass/fail between baseline and candidate:
```python
# TestDiffScope enum values to look for:
# RETURN_VALUE - function returned different value
# STDOUT - different stdout output
# DID_PASS - test passed/failed differently
```
2. **Benchmark failures** — candidate must beat `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup)
3. **Stability failures** — timing must be stable within `STABILITY_WINDOW_SIZE=0.35` (35% of iterations)
4. Check JUnit XML test results in the temp directory for specific failure messages
**Checkpoint**: Behavioral failure = optimization changed behavior (check test diffs). Benchmark failure = not fast enough. Stability failure = noisy timing environment.
## Step 6: Check Deduplication
Verify candidates weren't deduplicated away.
1. `CandidateEvaluationContext.ast_code_to_id` tracks normalized AST → candidate mapping
2. `normalize_code()` from `code_utils/deduplicate_code.py` strips comments/whitespace and normalizes the AST
3. If all candidates normalize to identical code, only the first is tested — the rest copy its results
**Checkpoint**: If all duplicates, the LLM generated the same optimization repeatedly. Try a higher effort level for more diverse candidates.
## Step 7: Check Repair/Refinement
If initial candidates failed, check repair and refinement stages.
1. Repair only triggers if fewer than `MIN_CORRECT_CANDIDATES=2` passed behavioral tests
2. Repair sends `AIServiceCodeRepairRequest` with `TestDiff` objects showing what went wrong
3. Check `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` (effort-dependent: 0.2/0.3/0.4) — if too many tests failed, repair is skipped entirely
4. Refinement only runs on the top valid candidates (count depends on effort level)
**Checkpoint**: If repair also fails, the optimization approach likely doesn't work for this function. The function may rely on side effects or external state that the LLM can't safely optimize.
## Key Files Reference
| File | What to check |
|------|---------------|
| `optimization/function_optimizer.py` | Main loop, `determine_best_candidate()` |
| `verification/test_runner.py` | Test subprocess execution |
| `api/aiservice.py` | AI service requests/responses |
| `code_utils/config_consts.py` | All thresholds and limits |
| `context/code_context_extractor.py` | Context extraction and token counting |
| `models/models.py` | `CandidateEvaluationContext`, `TestResults`, `TestDiff` |
| `code_utils/deduplicate_code.py` | AST normalization for deduplication |

View file

@ -1,14 +0,0 @@
{
"name": "codeflash/codeflash-skills",
"version": "0.2.0",
"summary": "Procedural workflows for developing and debugging codeflash",
"private": true,
"skills": {
"debug-optimization-failure": {
"path": "skills/debug-optimization-failure/SKILL.md"
},
"add-codeflash-feature": {
"path": "skills/add-codeflash-feature/SKILL.md"
}
}
}

773
uv.lock

File diff suppressed because it is too large Load diff