mirror of
https://github.com/codeflash-ai/codeflash.git
synced 2026-05-04 18:25:17 +00:00
Merge remote-tracking branch 'origin/main' into call-graphee
# Conflicts: # .codex/skills/.gitignore # .gemini/skills/.gitignore # codeflash/languages/python/context/code_context_extractor.py
This commit is contained in:
commit
2652e71617
82 changed files with 869 additions and 2346 deletions
4
.codex/config.toml
Normal file
4
.codex/config.toml
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
[mcp_servers.tessl]
|
||||
type = "stdio"
|
||||
command = "tessl"
|
||||
args = [ "mcp", "start" ]
|
||||
3
.codex/skills/.gitignore
vendored
3
.codex/skills/.gitignore
vendored
|
|
@ -1,3 +0,0 @@
|
|||
# Managed by Tessl
|
||||
tessl:*
|
||||
tessl__*
|
||||
12
.gemini/settings.json
Normal file
12
.gemini/settings.json
Normal file
|
|
@ -0,0 +1,12 @@
|
|||
{
|
||||
"mcpServers": {
|
||||
"tessl": {
|
||||
"type": "stdio",
|
||||
"command": "tessl",
|
||||
"args": [
|
||||
"mcp",
|
||||
"start"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
3
.gemini/skills/.gitignore
vendored
3
.gemini/skills/.gitignore
vendored
|
|
@ -1,3 +0,0 @@
|
|||
# Managed by Tessl
|
||||
tessl:*
|
||||
tessl__*
|
||||
|
|
@ -128,7 +128,7 @@ def determine_js_package_manager(project_root: Path) -> JsPackageManager:
|
|||
"""
|
||||
# Search from project_root up to filesystem root for lock files
|
||||
# This supports monorepo setups where lock file is at workspace root
|
||||
current_dir = project_root.resolve()
|
||||
current_dir = project_root
|
||||
while current_dir != current_dir.parent:
|
||||
if (current_dir / "bun.lockb").exists() or (current_dir / "bun.lock").exists():
|
||||
return JsPackageManager.BUN
|
||||
|
|
@ -161,7 +161,7 @@ def find_node_modules_with_package(project_root: Path, package_name: str) -> Pat
|
|||
Path to the node_modules directory containing the package, or None if not found.
|
||||
|
||||
"""
|
||||
current_dir = project_root.resolve()
|
||||
current_dir = project_root
|
||||
while current_dir != current_dir.parent:
|
||||
node_modules = current_dir / "node_modules"
|
||||
if node_modules.exists():
|
||||
|
|
|
|||
|
|
@ -709,6 +709,7 @@ def inject_profiling_into_existing_test(
|
|||
tests_project_root: Path,
|
||||
mode: TestingMode = TestingMode.BEHAVIOR,
|
||||
) -> tuple[bool, str | None]:
|
||||
tests_project_root = tests_project_root.resolve()
|
||||
if function_to_optimize.is_async:
|
||||
return inject_async_profiling_into_existing_test(
|
||||
test_path, call_positions, function_to_optimize, tests_project_root, mode
|
||||
|
|
|
|||
|
|
@ -69,8 +69,8 @@ FUNCTION_NAME_REGEX = re.compile(r"([^.]+)\.([a-zA-Z0-9_]+)$")
|
|||
class TestsCache:
|
||||
SCHEMA_VERSION = 1 # Increment this when schema changes
|
||||
|
||||
def __init__(self, project_root_path: str | Path) -> None:
|
||||
self.project_root_path = Path(project_root_path).resolve().as_posix()
|
||||
def __init__(self, project_root_path: Path) -> None:
|
||||
self.project_root_path = project_root_path.resolve().as_posix()
|
||||
self.connection = sqlite3.connect(codeflash_cache_db)
|
||||
self.cur = self.connection.cursor()
|
||||
|
||||
|
|
|
|||
|
|
@ -144,6 +144,27 @@ def find_functions_with_return_statement(ast_module: ast.Module, file_path: Path
|
|||
# Multi-language support helpers
|
||||
# =============================================================================
|
||||
|
||||
_VCS_EXCLUDES = frozenset({".git", ".hg", ".svn"})
|
||||
|
||||
|
||||
def parse_dir_excludes(patterns: frozenset[str]) -> tuple[frozenset[str], tuple[str, ...], tuple[str, ...]]:
|
||||
"""Split glob patterns into exact names, prefixes, and suffixes.
|
||||
|
||||
Patterns ending with ``*`` become prefix matches, patterns starting with ``*``
|
||||
become suffix matches, and plain strings become exact matches.
|
||||
"""
|
||||
exact: set[str] = set()
|
||||
prefixes: list[str] = []
|
||||
suffixes: list[str] = []
|
||||
for p in patterns:
|
||||
if p.endswith("*"):
|
||||
prefixes.append(p[:-1])
|
||||
elif p.startswith("*"):
|
||||
suffixes.append(p[1:])
|
||||
else:
|
||||
exact.add(p)
|
||||
return frozenset(exact), tuple(prefixes), tuple(suffixes)
|
||||
|
||||
|
||||
def get_files_for_language(
|
||||
module_root_path: Path, ignore_paths: list[Path] | None = None, language: Language | None = None
|
||||
|
|
@ -162,37 +183,44 @@ def get_files_for_language(
|
|||
if ignore_paths is None:
|
||||
ignore_paths = []
|
||||
|
||||
all_patterns: frozenset[str]
|
||||
if language is not None:
|
||||
support = get_language_support(language)
|
||||
extensions = support.file_extensions
|
||||
all_patterns = support.dir_excludes | _VCS_EXCLUDES
|
||||
else:
|
||||
extensions = tuple(get_supported_extensions())
|
||||
all_patterns = _VCS_EXCLUDES
|
||||
for lang in Language:
|
||||
if is_language_supported(lang):
|
||||
all_patterns = all_patterns | get_language_support(lang).dir_excludes
|
||||
|
||||
# Default directory patterns to always exclude for JS/TS
|
||||
js_ts_default_excludes = {
|
||||
"node_modules",
|
||||
"dist",
|
||||
"build",
|
||||
".next",
|
||||
".nuxt",
|
||||
"coverage",
|
||||
".cache",
|
||||
".turbo",
|
||||
".vercel",
|
||||
"__pycache__",
|
||||
}
|
||||
dir_excludes, prefixes, suffixes = parse_dir_excludes(all_patterns)
|
||||
|
||||
files = []
|
||||
for ext in extensions:
|
||||
pattern = f"*{ext}"
|
||||
for file_path in module_root_path.rglob(pattern):
|
||||
# Check explicit ignore paths
|
||||
if any(file_path.is_relative_to(ignore_path) for ignore_path in ignore_paths):
|
||||
continue
|
||||
# Check default JS/TS excludes in path parts
|
||||
if any(part in js_ts_default_excludes for part in file_path.parts):
|
||||
continue
|
||||
files.append(file_path)
|
||||
ignore_dirs: set[str] = set()
|
||||
ignore_files: set[Path] = set()
|
||||
for p in ignore_paths:
|
||||
p = Path(p) if not isinstance(p, Path) else p
|
||||
if p.is_file():
|
||||
ignore_files.add(p)
|
||||
else:
|
||||
ignore_dirs.add(str(p))
|
||||
|
||||
files: list[Path] = []
|
||||
for dirpath, dirnames, filenames in os.walk(module_root_path):
|
||||
dirnames[:] = [
|
||||
d
|
||||
for d in dirnames
|
||||
if d not in dir_excludes
|
||||
and not (prefixes and d.startswith(prefixes))
|
||||
and not (suffixes and d.endswith(suffixes))
|
||||
and str(Path(dirpath) / d) not in ignore_dirs
|
||||
]
|
||||
for fname in filenames:
|
||||
if fname.endswith(extensions):
|
||||
fpath = Path(dirpath, fname)
|
||||
if fpath not in ignore_files:
|
||||
files.append(fpath)
|
||||
return files
|
||||
|
||||
|
||||
|
|
@ -804,6 +832,7 @@ def filter_functions(
|
|||
*,
|
||||
disable_logs: bool = False,
|
||||
) -> tuple[dict[Path, list[FunctionToOptimize]], int]:
|
||||
resolved_project_root = project_root.resolve()
|
||||
filtered_modified_functions: dict[str, list[FunctionToOptimize]] = {}
|
||||
blocklist_funcs = get_blocklisted_functions()
|
||||
logger.debug(f"Blocklisted functions: {blocklist_funcs}")
|
||||
|
|
@ -880,7 +909,7 @@ def filter_functions(
|
|||
lang_support = get_language_support(Path(file_path))
|
||||
if lang_support.language == Language.PYTHON:
|
||||
try:
|
||||
ast.parse(f"import {module_name_from_file_path(Path(file_path), project_root)}")
|
||||
ast.parse(f"import {module_name_from_file_path(Path(file_path), resolved_project_root)}")
|
||||
except SyntaxError:
|
||||
malformed_paths_count += 1
|
||||
continue
|
||||
|
|
@ -902,7 +931,10 @@ def filter_functions(
|
|||
if previous_checkpoint_functions:
|
||||
functions_tmp = []
|
||||
for function in _functions:
|
||||
if function.qualified_name_with_modules_from_root(project_root) in previous_checkpoint_functions:
|
||||
if (
|
||||
function.qualified_name_with_modules_from_root(resolved_project_root)
|
||||
in previous_checkpoint_functions
|
||||
):
|
||||
previous_checkpoint_functions_removed_count += 1
|
||||
continue
|
||||
functions_tmp.append(function)
|
||||
|
|
|
|||
|
|
@ -294,6 +294,14 @@ class LanguageSupport(Protocol):
|
|||
"""Like # or //."""
|
||||
...
|
||||
|
||||
@property
|
||||
def dir_excludes(self) -> frozenset[str]:
|
||||
"""Directory name patterns to skip during file discovery.
|
||||
|
||||
Supports glob wildcards: "name" for exact, "prefix*" for startswith, "*suffix" for endswith.
|
||||
"""
|
||||
...
|
||||
|
||||
# === Discovery ===
|
||||
|
||||
def discover_functions(
|
||||
|
|
|
|||
|
|
@ -44,8 +44,7 @@ class ImportResolver:
|
|||
project_root: Root directory of the project.
|
||||
|
||||
"""
|
||||
# Resolve to real path to handle macOS symlinks like /var -> /private/var
|
||||
self.project_root = project_root.resolve()
|
||||
self.project_root = project_root
|
||||
self._resolution_cache: dict[tuple[Path, str], Path | None] = {}
|
||||
|
||||
def resolve_import(self, import_info: ImportInfo, source_file: Path) -> ResolvedImport | None:
|
||||
|
|
|
|||
|
|
@ -63,6 +63,10 @@ class JavaScriptSupport:
|
|||
def comment_prefix(self) -> str:
|
||||
return "//"
|
||||
|
||||
@property
|
||||
def dir_excludes(self) -> frozenset[str]:
|
||||
return frozenset({"node_modules", "dist", "build", ".next", ".nuxt", "coverage", ".cache", ".turbo", ".vercel"})
|
||||
|
||||
# === Discovery ===
|
||||
|
||||
def discover_functions(
|
||||
|
|
|
|||
|
|
@ -2,7 +2,6 @@ from __future__ import annotations
|
|||
|
||||
import ast
|
||||
import hashlib
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
|
|
@ -37,7 +36,7 @@ from codeflash.optimization.function_context import belongs_to_function_qualifie
|
|||
if TYPE_CHECKING:
|
||||
from jedi.api.classes import Name
|
||||
|
||||
from codeflash.languages.base import DependencyResolver, HelperFunction
|
||||
from codeflash.languages.base import HelperFunction
|
||||
from codeflash.languages.python.context.unused_definition_remover import UsageInfo
|
||||
|
||||
# Error message constants
|
||||
|
|
@ -81,7 +80,6 @@ def get_code_optimization_context(
|
|||
project_root_path: Path,
|
||||
optim_token_limit: int = OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
|
||||
testgen_token_limit: int = TESTGEN_CONTEXT_TOKEN_LIMIT,
|
||||
call_graph: DependencyResolver | None = None,
|
||||
) -> CodeOptimizationContext:
|
||||
# Route to language-specific implementation for non-Python languages
|
||||
if not is_python():
|
||||
|
|
@ -90,11 +88,9 @@ def get_code_optimization_context(
|
|||
)
|
||||
|
||||
# Get FunctionSource representation of helpers of FTO
|
||||
fto_input = {function_to_optimize.file_path: {function_to_optimize.qualified_name}}
|
||||
if call_graph is not None:
|
||||
helpers_of_fto_dict, helpers_of_fto_list = call_graph.get_callees(fto_input)
|
||||
else:
|
||||
helpers_of_fto_dict, helpers_of_fto_list = get_function_sources_from_jedi(fto_input, project_root_path)
|
||||
helpers_of_fto_dict, helpers_of_fto_list = get_function_sources_from_jedi(
|
||||
{function_to_optimize.file_path: {function_to_optimize.qualified_name}}, project_root_path
|
||||
)
|
||||
|
||||
# Add function to optimize into helpers of FTO dict, as they'll be processed together
|
||||
fto_as_function_source = get_function_to_optimize_as_function_source(function_to_optimize, project_root_path)
|
||||
|
|
@ -110,13 +106,9 @@ def get_code_optimization_context(
|
|||
for qualified_names in helpers_of_fto_qualified_names_dict.values():
|
||||
qualified_names.update({f"{qn.rsplit('.', 1)[0]}.__init__" for qn in qualified_names if "." in qn})
|
||||
|
||||
# Get FunctionSource representation of helpers of helpers of FTO
|
||||
if call_graph is not None:
|
||||
helpers_of_helpers_dict, _helpers_of_helpers_list = call_graph.get_callees(helpers_of_fto_qualified_names_dict)
|
||||
else:
|
||||
helpers_of_helpers_dict, _helpers_of_helpers_list = get_function_sources_from_jedi(
|
||||
helpers_of_fto_qualified_names_dict, project_root_path
|
||||
)
|
||||
helpers_of_helpers_dict, helpers_of_helpers_list = get_function_sources_from_jedi(
|
||||
helpers_of_fto_qualified_names_dict, project_root_path
|
||||
)
|
||||
|
||||
# Extract code context for optimization
|
||||
final_read_writable_code = extract_code_markdown_context_from_files(
|
||||
|
|
@ -192,6 +184,8 @@ def get_code_optimization_context(
|
|||
code_hash_context = hashing_code_context.markdown
|
||||
code_hash = hashlib.sha256(code_hash_context.encode("utf-8")).hexdigest()
|
||||
|
||||
all_helper_fqns = list({fs.fully_qualified_name for fs in helpers_of_fto_list + helpers_of_helpers_list})
|
||||
|
||||
return CodeOptimizationContext(
|
||||
testgen_context=testgen_context,
|
||||
read_writable_code=final_read_writable_code,
|
||||
|
|
@ -199,6 +193,7 @@ def get_code_optimization_context(
|
|||
hashing_code_context=code_hash_context,
|
||||
hashing_code_context_hash=code_hash,
|
||||
helper_functions=helpers_of_fto_list,
|
||||
testgen_helper_fqns=all_helper_fqns,
|
||||
preexisting_objects=preexisting_objects,
|
||||
)
|
||||
|
||||
|
|
@ -257,7 +252,7 @@ def get_code_optimization_context_for_language(
|
|||
fully_qualified_name=helper.qualified_name,
|
||||
only_function_name=helper.name,
|
||||
source_code=helper.source_code,
|
||||
definition_type=None,
|
||||
jedi_definition=None,
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -323,13 +318,12 @@ def get_code_optimization_context_for_language(
|
|||
return CodeOptimizationContext(
|
||||
testgen_context=testgen_context,
|
||||
read_writable_code=read_writable_code,
|
||||
# Pass type definitions and globals as read-only context for the AI
|
||||
# This way the AI sees them as context but doesn't include them in optimized output
|
||||
read_only_context_code=code_context.read_only_context,
|
||||
hashing_code_context=read_writable_code.flat,
|
||||
hashing_code_context_hash=code_hash,
|
||||
helper_functions=helper_function_sources,
|
||||
preexisting_objects=set(), # Not implemented for non-Python yet
|
||||
testgen_helper_fqns=[fs.fully_qualified_name for fs in helper_function_sources],
|
||||
preexisting_objects=set(),
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -480,7 +474,7 @@ def get_function_to_optimize_as_function_source(
|
|||
fully_qualified_name=name.full_name,
|
||||
only_function_name=name.name,
|
||||
source_code=name.get_line_code(),
|
||||
definition_type=name.type,
|
||||
jedi_definition=name,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.exception(f"Error while getting function source: {e}")
|
||||
|
|
@ -517,6 +511,10 @@ def get_function_sources_from_jedi(
|
|||
# TODO: there can be multiple definitions, see how to handle such cases
|
||||
definition = definitions[0]
|
||||
definition_path = definition.module_path
|
||||
if definition_path is not None:
|
||||
rel = safe_relative_to(definition_path, project_root_path)
|
||||
if not rel.is_absolute():
|
||||
definition_path = project_root_path / rel
|
||||
|
||||
# The definition is part of this project and not defined within the original function
|
||||
is_valid_definition = (
|
||||
|
|
@ -525,15 +523,16 @@ def get_function_sources_from_jedi(
|
|||
and not belongs_to_function_qualified(definition, qualified_function_name)
|
||||
and definition.full_name.startswith(definition.module_name)
|
||||
)
|
||||
if is_valid_definition and definition.type in ("function", "class"):
|
||||
if is_valid_definition and definition.type in ("function", "class", "statement"):
|
||||
if definition.type == "function":
|
||||
fqn = definition.full_name
|
||||
func_name = definition.name
|
||||
else:
|
||||
# When a class is instantiated (e.g., MyClass()), track its __init__ as a helper
|
||||
# This ensures the class definition with constructor is included in testgen context
|
||||
elif definition.type == "class":
|
||||
fqn = f"{definition.full_name}.__init__"
|
||||
func_name = "__init__"
|
||||
else:
|
||||
fqn = definition.full_name
|
||||
func_name = definition.name
|
||||
qualified_name = get_qualified_name(definition.module_name, fqn)
|
||||
# Avoid nested functions or classes. Only class.function is allowed
|
||||
if len(qualified_name.split(".")) <= 2:
|
||||
|
|
@ -543,7 +542,7 @@ def get_function_sources_from_jedi(
|
|||
fully_qualified_name=fqn,
|
||||
only_function_name=func_name,
|
||||
source_code=definition.get_line_code(),
|
||||
definition_type=definition.type,
|
||||
jedi_definition=definition,
|
||||
)
|
||||
file_path_to_function_source[definition_path].add(function_source)
|
||||
function_source_list.append(function_source)
|
||||
|
|
@ -940,7 +939,11 @@ def is_project_path(module_path: Path | None, project_root_path: Path) -> bool:
|
|||
# site-packages must be checked first because .venv/site-packages is under project root
|
||||
if path_belongs_to_site_packages(module_path):
|
||||
return False
|
||||
return str(module_path).startswith(str(project_root_path) + os.sep)
|
||||
try:
|
||||
module_path.resolve().relative_to(project_root_path.resolve())
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def _is_project_module(module_name: str, project_root_path: Path) -> bool:
|
||||
|
|
|
|||
|
|
@ -587,17 +587,20 @@ def revert_unused_helper_functions(
|
|||
|
||||
logger.debug(f"Reverting {len(unused_helpers)} unused helper function(s) to original definitions")
|
||||
|
||||
# Resolve all path keys for consistent comparison (Windows 8.3 short names may differ from Jedi-resolved paths)
|
||||
resolved_original_helper_code = {p.resolve(): code for p, code in original_helper_code.items()}
|
||||
|
||||
# Group unused helpers by file path
|
||||
unused_helpers_by_file = defaultdict(list)
|
||||
for helper in unused_helpers:
|
||||
unused_helpers_by_file[helper.file_path].append(helper)
|
||||
unused_helpers_by_file[helper.file_path.resolve()].append(helper)
|
||||
|
||||
# For each file, revert the unused helper functions to their original definitions
|
||||
for file_path, helpers_in_file in unused_helpers_by_file.items():
|
||||
if file_path in original_helper_code:
|
||||
if file_path in resolved_original_helper_code:
|
||||
try:
|
||||
# Get original code for this file
|
||||
original_code = original_helper_code[file_path]
|
||||
original_code = resolved_original_helper_code[file_path]
|
||||
|
||||
# Use the code replacer to selectively revert only the unused helper functions
|
||||
helper_names = [helper.qualified_name for helper in helpers_in_file]
|
||||
|
|
|
|||
|
|
@ -76,6 +76,37 @@ class PythonSupport:
|
|||
def comment_prefix(self) -> str:
|
||||
return "#"
|
||||
|
||||
@property
|
||||
def dir_excludes(self) -> frozenset[str]:
|
||||
return frozenset(
|
||||
{
|
||||
"__pycache__",
|
||||
".venv",
|
||||
"venv",
|
||||
".tox",
|
||||
".nox",
|
||||
".eggs",
|
||||
".mypy_cache",
|
||||
".ruff_cache",
|
||||
".pytest_cache",
|
||||
".hypothesis",
|
||||
"htmlcov",
|
||||
".pytype",
|
||||
".pyre",
|
||||
".pybuilder",
|
||||
".ipynb_checkpoints",
|
||||
".codeflash",
|
||||
".cache",
|
||||
".complexipy_cache",
|
||||
"build",
|
||||
"dist",
|
||||
"sdist",
|
||||
".coverage*",
|
||||
".pyright*",
|
||||
"*.egg-info",
|
||||
}
|
||||
)
|
||||
|
||||
# === Discovery ===
|
||||
|
||||
def discover_functions(
|
||||
|
|
|
|||
|
|
@ -379,6 +379,7 @@ class CodeOptimizationContext(BaseModel):
|
|||
hashing_code_context: str = ""
|
||||
hashing_code_context_hash: str = ""
|
||||
helper_functions: list[FunctionSource]
|
||||
testgen_helper_fqns: list[str] = []
|
||||
preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]]
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ from __future__ import annotations
|
|||
|
||||
import ast
|
||||
import concurrent.futures
|
||||
import dataclasses
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
|
|
@ -445,9 +446,12 @@ class FunctionOptimizer:
|
|||
replay_tests_dir: Path | None = None,
|
||||
call_graph: DependencyResolver | None = None,
|
||||
) -> None:
|
||||
self.project_root = test_cfg.project_root_path
|
||||
self.project_root = test_cfg.project_root_path.resolve()
|
||||
self.test_cfg = test_cfg
|
||||
self.aiservice_client = aiservice_client if aiservice_client else AiServiceClient()
|
||||
resolved_file_path = function_to_optimize.file_path.resolve()
|
||||
if resolved_file_path != function_to_optimize.file_path:
|
||||
function_to_optimize = dataclasses.replace(function_to_optimize, file_path=resolved_file_path)
|
||||
self.function_to_optimize = function_to_optimize
|
||||
self.function_to_optimize_source_code = (
|
||||
function_to_optimize_source_code
|
||||
|
|
@ -582,6 +586,7 @@ class FunctionOptimizer:
|
|||
test_results = self.generate_tests(
|
||||
testgen_context=code_context.testgen_context,
|
||||
helper_functions=code_context.helper_functions,
|
||||
testgen_helper_fqns=code_context.testgen_helper_fqns,
|
||||
generated_test_paths=generated_test_paths,
|
||||
generated_perf_test_paths=generated_perf_test_paths,
|
||||
)
|
||||
|
|
@ -1453,7 +1458,7 @@ class FunctionOptimizer:
|
|||
optimized_code = ""
|
||||
if optimized_context is not None:
|
||||
file_to_code_context = optimized_context.file_to_path()
|
||||
optimized_code = file_to_code_context.get(str(path.relative_to(self.project_root)), "")
|
||||
optimized_code = file_to_code_context.get(str(path.resolve().relative_to(self.project_root)), "")
|
||||
|
||||
new_code = format_code(
|
||||
self.args.formatter_cmds, path, optimized_code=optimized_code, check_diff=True, exit_on_failure=False
|
||||
|
|
@ -1524,7 +1529,8 @@ class FunctionOptimizer:
|
|||
read_only_context_code=new_code_ctx.read_only_context_code,
|
||||
hashing_code_context=new_code_ctx.hashing_code_context,
|
||||
hashing_code_context_hash=new_code_ctx.hashing_code_context_hash,
|
||||
helper_functions=new_code_ctx.helper_functions, # only functions that are read writable
|
||||
helper_functions=new_code_ctx.helper_functions,
|
||||
testgen_helper_fqns=new_code_ctx.testgen_helper_fqns,
|
||||
preexisting_objects=new_code_ctx.preexisting_objects,
|
||||
)
|
||||
)
|
||||
|
|
@ -1730,6 +1736,7 @@ class FunctionOptimizer:
|
|||
self,
|
||||
testgen_context: CodeStringsMarkdown,
|
||||
helper_functions: list[FunctionSource],
|
||||
testgen_helper_fqns: list[str],
|
||||
generated_test_paths: list[Path],
|
||||
generated_perf_test_paths: list[Path],
|
||||
) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]:
|
||||
|
|
@ -1738,13 +1745,9 @@ class FunctionOptimizer:
|
|||
assert len(generated_test_paths) == n_tests
|
||||
|
||||
if not self.args.no_gen_tests:
|
||||
# Submit test generation tasks
|
||||
helper_fqns = testgen_helper_fqns or [definition.fully_qualified_name for definition in helper_functions]
|
||||
future_tests = self.submit_test_generation_tasks(
|
||||
self.executor,
|
||||
testgen_context.markdown,
|
||||
[definition.fully_qualified_name for definition in helper_functions],
|
||||
generated_test_paths,
|
||||
generated_perf_test_paths,
|
||||
self.executor, testgen_context.markdown, helper_fqns, generated_test_paths, generated_perf_test_paths
|
||||
)
|
||||
|
||||
future_concolic_tests = self.executor.submit(
|
||||
|
|
|
|||
|
|
@ -126,10 +126,10 @@ def existing_tests_source_for(
|
|||
tests_dir_name = test_cfg.tests_project_rootdir.name
|
||||
if file_path.startswith((tests_dir_name + os.sep, tests_dir_name + "/")):
|
||||
# Module path includes "tests." - use project root parent
|
||||
instrumented_abs_path = (test_cfg.tests_project_rootdir.parent / file_path).resolve()
|
||||
instrumented_abs_path = test_cfg.tests_project_rootdir.parent / file_path
|
||||
else:
|
||||
# Module path doesn't include tests dir - use tests root directly
|
||||
instrumented_abs_path = (test_cfg.tests_project_rootdir / file_path).resolve()
|
||||
instrumented_abs_path = test_cfg.tests_project_rootdir / file_path
|
||||
logger.debug(f"[PR-DEBUG] Looking up: {instrumented_abs_path}")
|
||||
logger.debug(f"[PR-DEBUG] Available keys: {list(instrumented_to_original.keys())[:3]}")
|
||||
# Try to map instrumented path to original path
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import logging
|
|||
|
||||
import sentry_sdk
|
||||
from sentry_sdk.integrations.logging import LoggingIntegration
|
||||
from sentry_sdk.integrations.stdlib import StdlibIntegration
|
||||
|
||||
|
||||
def init_sentry(*, enabled: bool = False, exclude_errors: bool = False) -> None:
|
||||
|
|
@ -16,12 +17,8 @@ def init_sentry(*, enabled: bool = False, exclude_errors: bool = False) -> None:
|
|||
sentry_sdk.init(
|
||||
dsn="https://4b9a1902f9361b48c04376df6483bc96@o4506833230561280.ingest.sentry.io/4506833262477312",
|
||||
integrations=[sentry_logging],
|
||||
# Set traces_sample_rate to 1.0 to capture 100%
|
||||
# of transactions for performance monitoring.
|
||||
traces_sample_rate=1.0,
|
||||
# Set profiles_sample_rate to 1.0 to profile 100%
|
||||
# of sampled transactions.
|
||||
# We recommend adjusting this value in production.
|
||||
profiles_sample_rate=1.0,
|
||||
disabled_integrations=[StdlibIntegration],
|
||||
traces_sample_rate=0,
|
||||
profiles_sample_rate=0,
|
||||
ignore_errors=[KeyboardInterrupt],
|
||||
)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import importlib.util
|
||||
import subprocess
|
||||
import tempfile
|
||||
import time
|
||||
|
|
@ -18,6 +19,8 @@ from codeflash.lsp.helpers import is_LSP_enabled
|
|||
from codeflash.telemetry.posthog_cf import ph
|
||||
from codeflash.verification.verification_utils import TestConfig
|
||||
|
||||
CROSSHAIR_AVAILABLE = importlib.util.find_spec("crosshair") is not None
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from argparse import Namespace
|
||||
|
||||
|
|
@ -52,6 +55,10 @@ def generate_concolic_tests(
|
|||
logger.debug("Skipping concolic test generation for non-Python languages (CrossHair is Python-only)")
|
||||
return function_to_concolic_tests, concolic_test_suite_code
|
||||
|
||||
if not CROSSHAIR_AVAILABLE:
|
||||
logger.debug("Skipping concolic test generation (crosshair-tool is not installed)")
|
||||
return function_to_concolic_tests, concolic_test_suite_code
|
||||
|
||||
if is_LSP_enabled():
|
||||
logger.debug("Skipping concolic test generation in LSP mode")
|
||||
return function_to_concolic_tests, concolic_test_suite_code
|
||||
|
|
|
|||
|
|
@ -47,8 +47,24 @@ def parse_func(file_path: Path) -> XMLParser:
|
|||
return parse(file_path, xml_parser)
|
||||
|
||||
|
||||
matches_re_start = re.compile(r"!\$######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######\$!\n")
|
||||
matches_re_end = re.compile(r"!######(.*?):(.*?)([^\.:]*?):(.*?):(.*?):(.*?)######!")
|
||||
matches_re_start = re.compile(
|
||||
r"!\$######([^:]*)" # group 1: module path
|
||||
r":((?:[^:.]*\.)*)" # group 2: class prefix with trailing dot, or empty
|
||||
r"([^.:]*)" # group 3: test function name
|
||||
r":([^:]*)" # group 4: function being tested
|
||||
r":([^:]*)" # group 5: loop index
|
||||
r":([^#]*)" # group 6: iteration id
|
||||
r"######\$!\n"
|
||||
)
|
||||
matches_re_end = re.compile(
|
||||
r"!######([^:]*)" # group 1: module path
|
||||
r":((?:[^:.]*\.)*)" # group 2: class prefix with trailing dot, or empty
|
||||
r"([^.:]*)" # group 3: test function name
|
||||
r":([^:]*)" # group 4: function being tested
|
||||
r":([^:]*)" # group 5: loop index
|
||||
r":([^#]*)" # group 6: iteration_id or iteration_id:runtime
|
||||
r"######!"
|
||||
)
|
||||
|
||||
|
||||
start_pattern = re.compile(r"!\$######([^:]*):([^:]*):([^:]*):([^:]*):([^:]+)######\$!")
|
||||
|
|
@ -893,7 +909,6 @@ def merge_test_results(
|
|||
return merged_test_results
|
||||
|
||||
|
||||
FAILURES_HEADER_RE = re.compile(r"=+ FAILURES =+")
|
||||
TEST_HEADER_RE = re.compile(r"_{3,}\s*(.*?)\s*_{3,}$")
|
||||
|
||||
|
||||
|
|
@ -903,7 +918,7 @@ def parse_test_failures_from_stdout(stdout: str) -> dict[str, str]:
|
|||
start = end = None
|
||||
|
||||
for i, line in enumerate(lines):
|
||||
if FAILURES_HEADER_RE.search(line.strip()):
|
||||
if "= FAILURES =" in line:
|
||||
start = i
|
||||
break
|
||||
|
||||
|
|
|
|||
|
|
@ -158,6 +158,10 @@ class TestConfig:
|
|||
_language: Optional[str] = None # Language identifier for multi-language support
|
||||
js_project_root: Optional[Path] = None # JavaScript project root (directory containing package.json)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self.project_root_path = self.project_root_path.resolve()
|
||||
self.tests_project_rootdir = self.tests_project_rootdir.resolve()
|
||||
|
||||
@property
|
||||
def test_framework(self) -> str:
|
||||
"""Returns the appropriate test framework based on language.
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ dependencies = [
|
|||
"dill>=0.3.8",
|
||||
"rich>=13.8.1",
|
||||
"lxml>=5.3.0",
|
||||
"crosshair-tool>=0.0.78",
|
||||
"crosshair-tool>=0.0.78; python_version < '3.15'",
|
||||
"coverage>=7.6.4",
|
||||
"line_profiler>=4.2.0",
|
||||
"platformdirs>=4.3.7",
|
||||
|
|
@ -207,6 +207,8 @@ warn_unreachable = true
|
|||
install_types = true
|
||||
plugins = ["pydantic.mypy"]
|
||||
|
||||
exclude = ["tests/", "code_to_optimize/", "pie_test_set/", "experiments/"]
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = ["jedi", "jedi.api.classes", "inquirer", "inquirer.themes", "numba"]
|
||||
ignore_missing_imports = true
|
||||
|
|
@ -310,6 +312,9 @@ split-on-trailing-comma = false
|
|||
docstring-code-format = true
|
||||
skip-magic-trailing-comma = true
|
||||
|
||||
[tool.ty.src]
|
||||
exclude = ["tests", "code_to_optimize", "pie_test_set", "experiments"]
|
||||
|
||||
[tool.hatch.version]
|
||||
source = "uv-dynamic-versioning"
|
||||
|
||||
|
|
|
|||
27
tessl.json
27
tessl.json
|
|
@ -20,7 +20,7 @@
|
|||
"version": "0.13.0"
|
||||
},
|
||||
"tessl/pypi-pydantic": {
|
||||
"version": "1.10.0"
|
||||
"version": "2.11.0"
|
||||
},
|
||||
"tessl/pypi-humanize": {
|
||||
"version": "4.13.0"
|
||||
|
|
@ -35,7 +35,7 @@
|
|||
"version": "3.4.0"
|
||||
},
|
||||
"tessl/pypi-sentry-sdk": {
|
||||
"version": "1.45.0"
|
||||
"version": "2.36.0"
|
||||
},
|
||||
"tessl/pypi-parameterized": {
|
||||
"version": "0.9.0"
|
||||
|
|
@ -44,10 +44,10 @@
|
|||
"version": "0.4.0"
|
||||
},
|
||||
"tessl/pypi-rich": {
|
||||
"version": "13.9.0"
|
||||
"version": "14.1.0"
|
||||
},
|
||||
"tessl/pypi-lxml": {
|
||||
"version": "5.4.0"
|
||||
"version": "6.0.0"
|
||||
},
|
||||
"tessl/pypi-crosshair-tool": {
|
||||
"version": "0.0.0"
|
||||
|
|
@ -64,17 +64,20 @@
|
|||
"tessl/pypi-filelock": {
|
||||
"version": "3.19.0"
|
||||
},
|
||||
"codeflash/codeflash-rules": {
|
||||
"version": "0.1.0"
|
||||
"tessl/pypi-ipython": {
|
||||
"version": "9.5.0"
|
||||
},
|
||||
"codeflash/codeflash-docs": {
|
||||
"version": "0.1.0"
|
||||
"tessl/pypi-mypy": {
|
||||
"version": "1.17.0"
|
||||
},
|
||||
"codeflash/codeflash-skills": {
|
||||
"version": "0.2.0"
|
||||
"tessl/pypi-ty": {
|
||||
"version": "0.0.0"
|
||||
},
|
||||
"tessl-labs/tessl-skill-eval-scenarios": {
|
||||
"version": "0.0.5"
|
||||
"tessl/pypi-types-jsonschema": {
|
||||
"version": "3.2.0"
|
||||
},
|
||||
"tessl/pypi-uv": {
|
||||
"version": "0.8.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1171,7 +1171,12 @@ def test_repo_helper() -> None:
|
|||
code_ctx = get_code_optimization_context(function_to_optimize, project_root)
|
||||
read_write_context, read_only_context = code_ctx.read_writable_code, code_ctx.read_only_context_code
|
||||
hashing_context = code_ctx.hashing_code_context
|
||||
path_to_globals = project_root / "globals.py"
|
||||
expected_read_write_context = f"""
|
||||
```python:{path_to_globals.relative_to(project_root)}
|
||||
# Define a global variable
|
||||
API_URL = "https://api.example.com/data"
|
||||
```
|
||||
```python:{path_to_utils.relative_to(project_root)}
|
||||
import math
|
||||
|
||||
|
|
@ -1264,7 +1269,12 @@ def test_repo_helper_of_helper() -> None:
|
|||
code_ctx = get_code_optimization_context(function_to_optimize, project_root)
|
||||
read_write_context, read_only_context = code_ctx.read_writable_code, code_ctx.read_only_context_code
|
||||
hashing_context = code_ctx.hashing_code_context
|
||||
path_to_globals = project_root / "globals.py"
|
||||
expected_read_write_context = f"""
|
||||
```python:{path_to_globals.relative_to(project_root)}
|
||||
# Define a global variable
|
||||
API_URL = "https://api.example.com/data"
|
||||
```
|
||||
```python:{path_to_utils.relative_to(project_root)}
|
||||
import math
|
||||
from transform_utils import DataTransformer
|
||||
|
|
@ -1991,6 +2001,8 @@ class Calculator:
|
|||
"""
|
||||
expected_read_only_context = """
|
||||
```python:utility_module.py
|
||||
import sys
|
||||
|
||||
DEFAULT_PRECISION = "medium"
|
||||
|
||||
# Try-except block with variable definitions
|
||||
|
|
@ -2001,6 +2013,17 @@ except ImportError:
|
|||
# Used variable in except block
|
||||
CALCULATION_BACKEND = "python"
|
||||
|
||||
# Nested if-else with variable definitions
|
||||
if sys.platform.startswith('win'):
|
||||
# Used variable in outer if
|
||||
SYSTEM_TYPE = "windows"
|
||||
elif sys.platform.startswith('linux'):
|
||||
# Used variable in outer elif
|
||||
SYSTEM_TYPE = "linux"
|
||||
else:
|
||||
# Used variable in outer else
|
||||
SYSTEM_TYPE = "other"
|
||||
|
||||
# Function that will be used in the main code
|
||||
def select_precision(precision, fallback_precision):
|
||||
if precision is None:
|
||||
|
|
@ -2207,6 +2230,8 @@ def get_system_details():
|
|||
relative_path = file_path.relative_to(project_root)
|
||||
expected_read_write_context = f"""
|
||||
```python:utility_module.py
|
||||
import sys
|
||||
|
||||
DEFAULT_PRECISION = "medium"
|
||||
|
||||
# Try-except block with variable definitions
|
||||
|
|
@ -2217,6 +2242,17 @@ except ImportError:
|
|||
# Used variable in except block
|
||||
CALCULATION_BACKEND = "python"
|
||||
|
||||
# Nested if-else with variable definitions
|
||||
if sys.platform.startswith('win'):
|
||||
# Used variable in outer if
|
||||
SYSTEM_TYPE = "windows"
|
||||
elif sys.platform.startswith('linux'):
|
||||
# Used variable in outer elif
|
||||
SYSTEM_TYPE = "linux"
|
||||
else:
|
||||
# Used variable in outer else
|
||||
SYSTEM_TYPE = "other"
|
||||
|
||||
# Function that will be used in the main code
|
||||
def select_precision(precision, fallback_precision):
|
||||
if precision is None:
|
||||
|
|
@ -2257,6 +2293,8 @@ class Calculator:
|
|||
"""
|
||||
expected_read_only_context = """
|
||||
```python:utility_module.py
|
||||
import sys
|
||||
|
||||
DEFAULT_PRECISION = "medium"
|
||||
|
||||
# Try-except block with variable definitions
|
||||
|
|
@ -2266,6 +2304,17 @@ try:
|
|||
except ImportError:
|
||||
# Used variable in except block
|
||||
CALCULATION_BACKEND = "python"
|
||||
|
||||
# Nested if-else with variable definitions
|
||||
if sys.platform.startswith('win'):
|
||||
# Used variable in outer if
|
||||
SYSTEM_TYPE = "windows"
|
||||
elif sys.platform.startswith('linux'):
|
||||
# Used variable in outer elif
|
||||
SYSTEM_TYPE = "linux"
|
||||
else:
|
||||
# Used variable in outer else
|
||||
SYSTEM_TYPE = "other"
|
||||
```
|
||||
"""
|
||||
assert read_write_context.markdown.strip() == expected_read_write_context.strip()
|
||||
|
|
|
|||
189
tests/test_parse_test_output_regex.py
Normal file
189
tests/test_parse_test_output_regex.py
Normal file
|
|
@ -0,0 +1,189 @@
|
|||
"""Tests for the regex patterns and string matching in parse_test_output.py."""
|
||||
|
||||
from codeflash.verification.parse_test_output import (
|
||||
matches_re_end,
|
||||
matches_re_start,
|
||||
parse_test_failures_from_stdout,
|
||||
)
|
||||
|
||||
|
||||
# --- matches_re_start tests ---
|
||||
|
||||
|
||||
class TestMatchesReStart:
|
||||
def test_simple_no_class(self) -> None:
|
||||
s = "!$######tests.test_foo:test_bar:target_func:1:abc######$!\n"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("tests.test_foo", "", "test_bar", "target_func", "1", "abc")
|
||||
|
||||
def test_with_class(self) -> None:
|
||||
s = "!$######tests.test_foo:MyClass.test_bar:target_func:1:abc######$!\n"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("tests.test_foo", "MyClass.", "test_bar", "target_func", "1", "abc")
|
||||
|
||||
def test_nested_class(self) -> None:
|
||||
s = "!$######a.b.c:A.B.test_x:func:3:id123######$!\n"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("a.b.c", "A.B.", "test_x", "func", "3", "id123")
|
||||
|
||||
def test_empty_class_and_function(self) -> None:
|
||||
s = "!$######mod::func:0:iter######$!\n"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("mod", "", "", "func", "0", "iter")
|
||||
|
||||
def test_embedded_in_stdout(self) -> None:
|
||||
s = "some output\n!$######mod:test_fn:f:1:x######$!\nmore output\n"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("mod", "", "test_fn", "f", "1", "x")
|
||||
|
||||
def test_multiple_matches(self) -> None:
|
||||
s = (
|
||||
"!$######m1:C1.fn1:t1:1:a######$!\n"
|
||||
"!$######m2:fn2:t2:2:b######$!\n"
|
||||
)
|
||||
matches = list(matches_re_start.finditer(s))
|
||||
assert len(matches) == 2
|
||||
assert matches[0].groups() == ("m1", "C1.", "fn1", "t1", "1", "a")
|
||||
assert matches[1].groups() == ("m2", "", "fn2", "t2", "2", "b")
|
||||
|
||||
def test_no_match_without_newline(self) -> None:
|
||||
s = "!$######mod:test_fn:f:1:x######$!"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is None
|
||||
|
||||
def test_dots_in_module_path(self) -> None:
|
||||
s = "!$######a.b.c.d.e:test_fn:f:1:x######$!\n"
|
||||
m = matches_re_start.search(s)
|
||||
assert m is not None
|
||||
assert m.group(1) == "a.b.c.d.e"
|
||||
|
||||
|
||||
# --- matches_re_end tests ---
|
||||
|
||||
|
||||
class TestMatchesReEnd:
|
||||
def test_simple_no_class_with_runtime(self) -> None:
|
||||
s = "!######tests.test_foo:test_bar:target_func:1:abc:12345######!"
|
||||
m = matches_re_end.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("tests.test_foo", "", "test_bar", "target_func", "1", "abc:12345")
|
||||
|
||||
def test_with_class_no_runtime(self) -> None:
|
||||
s = "!######tests.test_foo:MyClass.test_bar:target_func:1:abc######!"
|
||||
m = matches_re_end.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("tests.test_foo", "MyClass.", "test_bar", "target_func", "1", "abc")
|
||||
|
||||
def test_nested_class_with_runtime(self) -> None:
|
||||
s = "!######mod:A.B.test_x:func:3:id123:99999######!"
|
||||
m = matches_re_end.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("mod", "A.B.", "test_x", "func", "3", "id123:99999")
|
||||
|
||||
def test_runtime_colon_preserved_in_group6(self) -> None:
|
||||
"""Group 6 must capture 'iteration_id:runtime' as a single string (colon included)."""
|
||||
s = "!######m:fn:f:1:iter42:98765######!"
|
||||
m = matches_re_end.search(s)
|
||||
assert m is not None
|
||||
assert m.group(6) == "iter42:98765"
|
||||
|
||||
def test_embedded_in_stdout(self) -> None:
|
||||
s = "captured output\n!######mod:test_fn:f:1:x:500######!\nmore"
|
||||
m = matches_re_end.search(s)
|
||||
assert m is not None
|
||||
assert m.groups() == ("mod", "", "test_fn", "f", "1", "x:500")
|
||||
|
||||
|
||||
# --- Start/End pairing (simulates parse_test_xml matching logic) ---
|
||||
|
||||
|
||||
class TestStartEndPairing:
|
||||
def test_paired_markers(self) -> None:
|
||||
stdout = (
|
||||
"!$######mod:Class.test_fn:func:1:iter1######$!\n"
|
||||
"test output here\n"
|
||||
"!######mod:Class.test_fn:func:1:iter1:54321######!"
|
||||
)
|
||||
starts = list(matches_re_start.finditer(stdout))
|
||||
ends = {}
|
||||
for match in matches_re_end.finditer(stdout):
|
||||
groups = match.groups()
|
||||
g5 = groups[5]
|
||||
colon_pos = g5.find(":")
|
||||
if colon_pos != -1:
|
||||
key = groups[:5] + (g5[:colon_pos],)
|
||||
else:
|
||||
key = groups
|
||||
ends[key] = match
|
||||
|
||||
assert len(starts) == 1
|
||||
assert len(ends) == 1
|
||||
# Start and end should pair on the first 5 groups + iteration_id
|
||||
start_groups = starts[0].groups()
|
||||
assert start_groups in ends
|
||||
|
||||
|
||||
# --- parse_test_failures_from_stdout tests ---
|
||||
|
||||
|
||||
class TestParseTestFailuresHeader:
|
||||
def test_standard_pytest_header(self) -> None:
|
||||
stdout = (
|
||||
"..F.\n"
|
||||
"=================================== FAILURES ===================================\n"
|
||||
"_______ test_foo _______\n"
|
||||
"\n"
|
||||
" def test_foo():\n"
|
||||
"> assert False\n"
|
||||
"E AssertionError\n"
|
||||
"\n"
|
||||
"test.py:3: AssertionError\n"
|
||||
"=========================== short test summary info ============================\n"
|
||||
"FAILED test.py::test_foo\n"
|
||||
)
|
||||
result = parse_test_failures_from_stdout(stdout)
|
||||
assert "test_foo" in result
|
||||
|
||||
def test_minimal_equals(self) -> None:
|
||||
"""Even a short '= FAILURES =' header should be detected."""
|
||||
stdout = (
|
||||
"= FAILURES =\n"
|
||||
"_______ test_bar _______\n"
|
||||
"\n"
|
||||
" assert False\n"
|
||||
"\n"
|
||||
"test.py:1: AssertionError\n"
|
||||
"= short test summary info =\n"
|
||||
)
|
||||
result = parse_test_failures_from_stdout(stdout)
|
||||
assert "test_bar" in result
|
||||
|
||||
def test_no_failures_section(self) -> None:
|
||||
stdout = "....\n4 passed in 0.1s\n"
|
||||
result = parse_test_failures_from_stdout(stdout)
|
||||
assert result == {}
|
||||
|
||||
def test_word_failures_without_equals_is_not_matched(self) -> None:
|
||||
"""'FAILURES' without surrounding '=' signs should not trigger the header detection."""
|
||||
stdout = (
|
||||
"FAILURES detected in module\n"
|
||||
"_______ test_baz _______\n"
|
||||
"\n"
|
||||
" assert False\n"
|
||||
)
|
||||
result = parse_test_failures_from_stdout(stdout)
|
||||
assert result == {}
|
||||
|
||||
def test_failures_in_test_output_not_matched(self) -> None:
|
||||
"""A test printing 'FAILURES' (no = signs) should not trigger header detection."""
|
||||
stdout = (
|
||||
"Testing FAILURES handling\n"
|
||||
"All good\n"
|
||||
)
|
||||
result = parse_test_failures_from_stdout(stdout)
|
||||
assert result == {}
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
# AI Service
|
||||
|
||||
How codeflash communicates with the AI optimization backend.
|
||||
|
||||
## `AiServiceClient` (`api/aiservice.py`)
|
||||
|
||||
The client connects to the AI service at `https://app.codeflash.ai` (or `http://localhost:8000` when `CODEFLASH_AIS_SERVER=local`).
|
||||
|
||||
Authentication uses Bearer token from `get_codeflash_api_key()`. All requests go through `make_ai_service_request()` which handles JSON serialization via Pydantic encoder.
|
||||
|
||||
Timeout: 90s for production, 300s for local.
|
||||
|
||||
## Endpoints
|
||||
|
||||
### `/ai/optimize` — Generate Candidates
|
||||
|
||||
Method: `optimize_code()`
|
||||
|
||||
Sends source code + dependency context to generate optimization candidates.
|
||||
|
||||
Payload:
|
||||
- `source_code` — The read-writable code (markdown format)
|
||||
- `dependency_code` — Read-only context code
|
||||
- `trace_id` — Unique trace ID for the optimization run
|
||||
- `language` — `"python"`, `"javascript"`, or `"typescript"`
|
||||
- `n_candidates` — Number of candidates to generate (controlled by effort level)
|
||||
- `is_async` — Whether the function is async
|
||||
- `is_numerical_code` — Whether the code is numerical (affects optimization strategy)
|
||||
|
||||
Returns: `list[OptimizedCandidate]` with `source=OptimizedCandidateSource.OPTIMIZE`
|
||||
|
||||
### `/ai/optimize_line_profiler` — Line-Profiler-Guided Candidates
|
||||
|
||||
Method: `optimize_python_code_line_profiler()`
|
||||
|
||||
Like `/optimize` but includes `line_profiler_results` to guide the LLM toward hot lines.
|
||||
|
||||
Returns: candidates with `source=OptimizedCandidateSource.OPTIMIZE_LP`
|
||||
|
||||
### `/ai/refine` — Refine Existing Candidate
|
||||
|
||||
Method: `refine_code()`
|
||||
|
||||
Request type: `AIServiceRefinerRequest`
|
||||
|
||||
Sends an existing candidate with runtime data and line profiler results to generate an improved version.
|
||||
|
||||
Key fields:
|
||||
- `original_source_code` / `optimized_source_code` — Before and after
|
||||
- `original_code_runtime` / `optimized_code_runtime` — Timing data
|
||||
- `speedup` — Current speedup ratio
|
||||
- `original_line_profiler_results` / `optimized_line_profiler_results`
|
||||
|
||||
Returns: candidates with `source=OptimizedCandidateSource.REFINE` and `parent_id` set to the refined candidate's ID
|
||||
|
||||
### `/ai/repair` — Fix Failed Candidate
|
||||
|
||||
Method: `repair_code()`
|
||||
|
||||
Request type: `AIServiceCodeRepairRequest`
|
||||
|
||||
Sends a failed candidate with test diffs showing what went wrong.
|
||||
|
||||
Key fields:
|
||||
- `original_source_code` / `modified_source_code`
|
||||
- `test_diffs: list[TestDiff]` — Each with `scope` (return_value/stdout/did_pass), original vs candidate values, and test source code
|
||||
|
||||
Returns: candidates with `source=OptimizedCandidateSource.REPAIR` and `parent_id` set
|
||||
|
||||
### `/ai/adaptive_optimize` — Multi-Candidate Adaptive
|
||||
|
||||
Method: `adaptive_optimize()`
|
||||
|
||||
Request type: `AIServiceAdaptiveOptimizeRequest`
|
||||
|
||||
Sends multiple previous candidates with their speedups for the LLM to learn from and generate better candidates.
|
||||
|
||||
Key fields:
|
||||
- `candidates: list[AdaptiveOptimizedCandidate]` — Previous candidates with source code, explanation, source type, and speedup
|
||||
|
||||
Returns: candidates with `source=OptimizedCandidateSource.ADAPTIVE`
|
||||
|
||||
### `/ai/rewrite_jit` — JIT Rewrite
|
||||
|
||||
Method: `get_jit_rewritten_code()`
|
||||
|
||||
Rewrites code to use JIT compilation (e.g., Numba).
|
||||
|
||||
Returns: candidates with `source=OptimizedCandidateSource.JIT_REWRITE`
|
||||
|
||||
## Candidate Parsing
|
||||
|
||||
All endpoints return JSON with an `optimizations` array. Each entry has:
|
||||
- `source_code` — Markdown-formatted code blocks
|
||||
- `explanation` — LLM explanation
|
||||
- `optimization_id` — Unique ID
|
||||
- `parent_id` — Optional parent reference
|
||||
- `model` — Which LLM model was used
|
||||
|
||||
`_get_valid_candidates()` parses the markdown code via `CodeStringsMarkdown.parse_markdown_code()` and filters out entries with empty code blocks.
|
||||
|
||||
## `LocalAiServiceClient`
|
||||
|
||||
Used when `CODEFLASH_EXPERIMENT_ID` is set. Mirrors `AiServiceClient` but sends to a separate experimental endpoint for A/B testing optimization strategies.
|
||||
|
||||
## LLM Call Sequencing
|
||||
|
||||
`AiServiceClient` tracks call sequence via `llm_call_counter` (itertools.count). Each request includes a `call_sequence` number, used by the backend to maintain conversation context across multiple calls for the same function.
|
||||
|
|
@ -1,79 +0,0 @@
|
|||
# Configuration
|
||||
|
||||
Key configuration constants, effort levels, and thresholds.
|
||||
|
||||
## Constants (`code_utils/config_consts.py`)
|
||||
|
||||
### Test Execution
|
||||
|
||||
| Constant | Value | Description |
|
||||
|----------|-------|-------------|
|
||||
| `MAX_TEST_RUN_ITERATIONS` | 5 | Maximum test loop iterations |
|
||||
| `INDIVIDUAL_TESTCASE_TIMEOUT` | 15s | Timeout per individual test case |
|
||||
| `MAX_FUNCTION_TEST_SECONDS` | 60s | Max total time for function testing |
|
||||
| `MAX_TEST_FUNCTION_RUNS` | 50 | Max test function executions |
|
||||
| `MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS` | 100ms | Max cumulative test runtime |
|
||||
| `TOTAL_LOOPING_TIME` | 10s | Candidate benchmarking budget |
|
||||
| `MIN_TESTCASE_PASSED_THRESHOLD` | 6 | Minimum test cases that must pass |
|
||||
|
||||
### Performance Thresholds
|
||||
|
||||
| Constant | Value | Description |
|
||||
|----------|-------|-------------|
|
||||
| `MIN_IMPROVEMENT_THRESHOLD` | 0.05 (5%) | Minimum speedup to accept a candidate |
|
||||
| `MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD` | 0.10 (10%) | Minimum async throughput improvement |
|
||||
| `MIN_CONCURRENCY_IMPROVEMENT_THRESHOLD` | 0.20 (20%) | Minimum concurrency ratio improvement |
|
||||
| `COVERAGE_THRESHOLD` | 60.0% | Minimum test coverage |
|
||||
|
||||
### Stability Thresholds
|
||||
|
||||
| Constant | Value | Description |
|
||||
|----------|-------|-------------|
|
||||
| `STABILITY_WINDOW_SIZE` | 0.35 | 35% of total iteration window |
|
||||
| `STABILITY_CENTER_TOLERANCE` | 0.0025 | ±0.25% around median |
|
||||
| `STABILITY_SPREAD_TOLERANCE` | 0.0025 | 0.25% window spread |
|
||||
|
||||
### Context Limits
|
||||
|
||||
| Constant | Value | Description |
|
||||
|----------|-------|-------------|
|
||||
| `OPTIMIZATION_CONTEXT_TOKEN_LIMIT` | 16000 | Max tokens for optimization context |
|
||||
| `TESTGEN_CONTEXT_TOKEN_LIMIT` | 16000 | Max tokens for test generation context |
|
||||
| `MAX_CONTEXT_LEN_REVIEW` | 1000 | Max context length for optimization review |
|
||||
|
||||
### Other
|
||||
|
||||
| Constant | Value | Description |
|
||||
|----------|-------|-------------|
|
||||
| `MIN_CORRECT_CANDIDATES` | 2 | Min correct candidates before skipping repair |
|
||||
| `REPEAT_OPTIMIZATION_PROBABILITY` | 0.1 | Probability of re-optimizing a function |
|
||||
| `DEFAULT_IMPORTANCE_THRESHOLD` | 0.001 | Minimum addressable time to consider a function |
|
||||
| `CONCURRENCY_FACTOR` | 10 | Number of concurrent executions for concurrency benchmark |
|
||||
| `REFINED_CANDIDATE_RANKING_WEIGHTS` | (2, 1) | (runtime, diff) weights — runtime 2x more important |
|
||||
|
||||
## Effort Levels
|
||||
|
||||
`EffortLevel` enum: `LOW`, `MEDIUM`, `HIGH`
|
||||
|
||||
Effort controls the number of candidates, repairs, and refinements:
|
||||
|
||||
| Key | LOW | MEDIUM | HIGH |
|
||||
|-----|-----|--------|------|
|
||||
| `N_OPTIMIZER_CANDIDATES` | 3 | 5 | 6 |
|
||||
| `N_OPTIMIZER_LP_CANDIDATES` | 4 | 6 | 7 |
|
||||
| `N_GENERATED_TESTS` | 2 | 2 | 2 |
|
||||
| `MAX_CODE_REPAIRS_PER_TRACE` | 2 | 3 | 5 |
|
||||
| `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` | 0.2 | 0.3 | 0.4 |
|
||||
| `TOP_VALID_CANDIDATES_FOR_REFINEMENT` | 2 | 3 | 4 |
|
||||
| `ADAPTIVE_OPTIMIZATION_THRESHOLD` | 0 | 0 | 2 |
|
||||
| `MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE` | 0 | 0 | 4 |
|
||||
|
||||
Use `get_effort_value(EffortKeys.KEY, effort_level)` to retrieve values.
|
||||
|
||||
## Project Configuration
|
||||
|
||||
Configuration is read from `pyproject.toml` under `[tool.codeflash]`. Key settings are auto-detected by `setup/detector.py`:
|
||||
- `module-root` — Root of the module to optimize
|
||||
- `tests-root` — Root of test files
|
||||
- `test-framework` — pytest, unittest, jest, etc.
|
||||
- `formatter-cmds` — Code formatting commands
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
# Context Extraction
|
||||
|
||||
How codeflash extracts and limits code context for optimization and test generation.
|
||||
|
||||
## Overview
|
||||
|
||||
Context extraction (`context/code_context_extractor.py`) builds a `CodeOptimizationContext` containing all code needed for the LLM to understand and optimize a function, split into:
|
||||
|
||||
- **Read-writable code** (`CodeContextType.READ_WRITABLE`): The function being optimized plus its helper functions — code the LLM is allowed to modify
|
||||
- **Read-only context** (`CodeContextType.READ_ONLY`): Dependency code for reference — imports, type definitions, base classes
|
||||
- **Testgen context** (`CodeContextType.TESTGEN`): Context for test generation, may include imported class definitions and external base class inits
|
||||
- **Hashing context** (`CodeContextType.HASHING`): Used for deduplication of optimization runs
|
||||
|
||||
## Token Limits
|
||||
|
||||
Both optimization and test generation contexts are token-limited:
|
||||
- `OPTIMIZATION_CONTEXT_TOKEN_LIMIT = 16000` tokens
|
||||
- `TESTGEN_CONTEXT_TOKEN_LIMIT = 16000` tokens
|
||||
|
||||
Token counting uses `encoded_tokens_len()` from `code_utils/code_utils.py`. Functions whose context exceeds these limits are skipped.
|
||||
|
||||
## Context Building Process
|
||||
|
||||
### 1. Helper Discovery
|
||||
|
||||
For the target function (`FunctionToOptimize`), the extractor finds:
|
||||
- **Helpers of the function**: Functions/classes in the same file that the target function calls
|
||||
- **Helpers of helpers**: Transitive dependencies of the helper functions
|
||||
|
||||
These are organized as `dict[Path, set[FunctionSource]]` — mapping file paths to the set of helper functions found in each file.
|
||||
|
||||
### 2. Code Extraction
|
||||
|
||||
`extract_code_markdown_context_from_files()` builds `CodeStringsMarkdown` from the helper dictionaries. Each file's relevant code is extracted as a `CodeString` with its file path.
|
||||
|
||||
### 3. Testgen Context Enrichment
|
||||
|
||||
`build_testgen_context()` extends the basic context with:
|
||||
- Imported class definitions (resolved from imports)
|
||||
- External base class `__init__` methods
|
||||
- External class `__init__` methods referenced in the context
|
||||
|
||||
### 4. Unused Definition Removal
|
||||
|
||||
`detect_unused_helper_functions()` and `remove_unused_definitions_by_function_names()` from `context/unused_definition_remover.py` prune definitions that are not transitively reachable from the target function, reducing token usage.
|
||||
|
||||
### 5. Deduplication
|
||||
|
||||
The hashing context (`hashing_code_context`) generates a hash (`hashing_code_context_hash`) used to detect when the same function context has already been optimized in a previous run, avoiding redundant work.
|
||||
|
||||
## Key Functions
|
||||
|
||||
| Function | Location | Purpose |
|
||||
|----------|----------|---------|
|
||||
| `build_testgen_context()` | `context/code_context_extractor.py` | Build enriched testgen context |
|
||||
| `extract_code_markdown_context_from_files()` | `context/code_context_extractor.py` | Convert helper dicts to `CodeStringsMarkdown` |
|
||||
| `detect_unused_helper_functions()` | `context/unused_definition_remover.py` | Find unused definitions |
|
||||
| `remove_unused_definitions_by_function_names()` | `context/unused_definition_remover.py` | Remove unused definitions |
|
||||
| `collect_top_level_defs_with_usages()` | `context/unused_definition_remover.py` | Analyze definition usage |
|
||||
| `encoded_tokens_len()` | `code_utils/code_utils.py` | Count tokens in code |
|
||||
|
|
@ -1,153 +0,0 @@
|
|||
# Domain Types
|
||||
|
||||
Core data types used throughout the codeflash optimization pipeline.
|
||||
|
||||
## Function Representation
|
||||
|
||||
### `FunctionToOptimize` (`models/function_types.py`)
|
||||
|
||||
The canonical dataclass representing a function candidate for optimization. Works across Python, JavaScript, and TypeScript.
|
||||
|
||||
Key fields:
|
||||
- `function_name: str` — The function name
|
||||
- `file_path: Path` — Absolute file path where the function is located
|
||||
- `parents: list[FunctionParent]` — Parent scopes (classes/functions), each with `name` and `type`
|
||||
- `starting_line / ending_line: Optional[int]` — Line range (1-indexed)
|
||||
- `is_async: bool` — Whether the function is async
|
||||
- `is_method: bool` — Whether it belongs to a class
|
||||
- `language: str` — Programming language (default: `"python"`)
|
||||
|
||||
Key properties:
|
||||
- `qualified_name` — Full dotted name including parent classes (e.g., `MyClass.my_method`)
|
||||
- `top_level_parent_name` — Name of outermost parent, or function name if no parents
|
||||
- `class_name` — Immediate parent class name, or `None`
|
||||
|
||||
### `FunctionParent` (`models/function_types.py`)
|
||||
|
||||
Represents a parent scope: `name: str` (e.g., `"MyClass"`) and `type: str` (e.g., `"ClassDef"`).
|
||||
|
||||
### `FunctionSource` (`models/models.py`)
|
||||
|
||||
Represents a resolved function with source code. Used for helper functions in context extraction.
|
||||
|
||||
Fields: `file_path`, `qualified_name`, `fully_qualified_name`, `only_function_name`, `source_code`, `jedi_definition`.
|
||||
|
||||
## Code Representation
|
||||
|
||||
### `CodeString` (`models/models.py`)
|
||||
|
||||
A single code block with validated syntax:
|
||||
- `code: str` — The source code
|
||||
- `file_path: Optional[Path]` — Origin file path
|
||||
- `language: str` — Language for validation (default: `"python"`)
|
||||
|
||||
Validates syntax on construction via `model_validator`.
|
||||
|
||||
### `CodeStringsMarkdown` (`models/models.py`)
|
||||
|
||||
A collection of `CodeString` blocks — the primary format for passing code through the pipeline.
|
||||
|
||||
Key properties:
|
||||
- `.flat` — Combined source code with file-path comment prefixes (e.g., `# file: path/to/file.py`)
|
||||
- `.markdown` — Markdown-formatted with fenced code blocks: `` ```python:filepath\ncode\n``` ``
|
||||
- `.file_to_path()` — Dict mapping file path strings to code
|
||||
|
||||
Static method:
|
||||
- `parse_markdown_code(markdown_code, expected_language)` — Parses markdown code blocks back into `CodeStringsMarkdown`
|
||||
|
||||
## Optimization Context
|
||||
|
||||
### `CodeOptimizationContext` (`models/models.py`)
|
||||
|
||||
Holds all code context needed for optimization:
|
||||
- `read_writable_code: CodeStringsMarkdown` — Code the LLM can modify
|
||||
- `read_only_context_code: str` — Reference-only dependency code
|
||||
- `testgen_context: CodeStringsMarkdown` — Context for test generation
|
||||
- `hashing_code_context: str` / `hashing_code_context_hash: str` — For deduplication
|
||||
- `helper_functions: list[FunctionSource]` — Helper functions in the writable code
|
||||
- `preexisting_objects: set[tuple[str, tuple[FunctionParent, ...]]]` — Objects that already exist in the code
|
||||
|
||||
### `CodeContextType` enum (`models/models.py`)
|
||||
|
||||
Defines context categories: `READ_WRITABLE`, `READ_ONLY`, `TESTGEN`, `HASHING`.
|
||||
|
||||
## Candidates
|
||||
|
||||
### `OptimizedCandidate` (`models/models.py`)
|
||||
|
||||
A generated code variant:
|
||||
- `source_code: CodeStringsMarkdown` — The optimized code
|
||||
- `explanation: str` — LLM explanation of the optimization
|
||||
- `optimization_id: str` — Unique identifier
|
||||
- `source: OptimizedCandidateSource` — How it was generated
|
||||
- `parent_id: str | None` — ID of parent candidate (for refinements/repairs)
|
||||
- `model: str | None` — Which LLM model generated it
|
||||
|
||||
### `OptimizedCandidateSource` enum (`models/models.py`)
|
||||
|
||||
How a candidate was generated: `OPTIMIZE`, `OPTIMIZE_LP` (line profiler), `REFINE`, `REPAIR`, `ADAPTIVE`, `JIT_REWRITE`.
|
||||
|
||||
### `CandidateEvaluationContext` (`models/models.py`)
|
||||
|
||||
Tracks state during candidate evaluation:
|
||||
- `speedup_ratios` / `optimized_runtimes` / `is_correct` — Per-candidate results
|
||||
- `ast_code_to_id` — Deduplication map (normalized AST → first seen candidate)
|
||||
- `valid_optimizations` — Candidates that passed all checks
|
||||
|
||||
Key methods: `record_failed_candidate()`, `record_successful_candidate()`, `handle_duplicate_candidate()`, `register_new_candidate()`.
|
||||
|
||||
## Baseline & Results
|
||||
|
||||
### `OriginalCodeBaseline` (`models/models.py`)
|
||||
|
||||
Baseline measurements for the original code:
|
||||
- `behavior_test_results: TestResults` / `benchmarking_test_results: TestResults`
|
||||
- `line_profile_results: dict`
|
||||
- `runtime: int` — Total runtime in nanoseconds
|
||||
- `coverage_results: Optional[CoverageData]`
|
||||
|
||||
### `BestOptimization` (`models/models.py`)
|
||||
|
||||
The winning candidate after evaluation:
|
||||
- `candidate: OptimizedCandidate`
|
||||
- `helper_functions: list[FunctionSource]`
|
||||
- `code_context: CodeOptimizationContext`
|
||||
- `runtime: int`
|
||||
- `winning_behavior_test_results` / `winning_benchmarking_test_results: TestResults`
|
||||
|
||||
## Test Types
|
||||
|
||||
### `TestType` enum (`models/test_type.py`)
|
||||
|
||||
- `EXISTING_UNIT_TEST` (1) — Pre-existing tests from the codebase
|
||||
- `INSPIRED_REGRESSION` (2) — Tests inspired by existing tests
|
||||
- `GENERATED_REGRESSION` (3) — AI-generated regression tests
|
||||
- `REPLAY_TEST` (4) — Tests from recorded benchmark data
|
||||
- `CONCOLIC_COVERAGE_TEST` (5) — Coverage-guided tests
|
||||
- `INIT_STATE_TEST` (6) — Class init state verification
|
||||
|
||||
### `TestFile` / `TestFiles` (`models/models.py`)
|
||||
|
||||
`TestFile` represents a single test file with `instrumented_behavior_file_path`, optional `benchmarking_file_path`, `original_file_path`, `test_type`, and `tests_in_file`.
|
||||
|
||||
`TestFiles` is a collection with lookup methods: `get_by_type()`, `get_by_original_file_path()`, `get_test_type_by_instrumented_file_path()`.
|
||||
|
||||
### `TestResults` (`models/models.py`)
|
||||
|
||||
Collection of `FunctionTestInvocation` results with indexed lookup. Key methods:
|
||||
- `add(invocation)` — Deduplicated insert
|
||||
- `total_passed_runtime()` — Sum of minimum runtimes per test case (nanoseconds)
|
||||
- `number_of_loops()` — Max loop index across all results
|
||||
- `usable_runtime_data_by_test_case()` — Dict of invocation ID → list of runtimes
|
||||
|
||||
## Result Type
|
||||
|
||||
### `Result[L, R]` / `Success` / `Failure` (`either.py`)
|
||||
|
||||
Functional error handling type:
|
||||
- `Success(value)` — Wraps a successful result
|
||||
- `Failure(error)` — Wraps an error
|
||||
- `result.is_successful()` / `result.is_failure()` — Check type
|
||||
- `result.unwrap()` — Get success value (raises if Failure)
|
||||
- `result.failure()` — Get failure value (raises if Success)
|
||||
- `is_successful(result)` — Module-level helper function
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
# Codeflash Internal Documentation
|
||||
|
||||
CodeFlash is an AI-powered Python code optimizer that automatically improves code performance while maintaining correctness. It uses LLMs to generate optimization candidates, verifies correctness through test execution, and benchmarks performance improvements.
|
||||
|
||||
## Pipeline Overview
|
||||
|
||||
```
|
||||
Discovery → Ranking → Context Extraction → Test Gen + Optimization → Baseline → Candidate Evaluation → PR
|
||||
```
|
||||
|
||||
1. **Discovery** (`discovery/`): Find optimizable functions across the codebase using `FunctionVisitor`
|
||||
2. **Ranking** (`benchmarking/function_ranker.py`): Rank functions by addressable time using trace data
|
||||
3. **Context** (`context/`): Extract code dependencies — split into read-writable (modifiable) and read-only (reference)
|
||||
4. **Optimization** (`optimization/`, `api/`): Generate candidates via AI service, runs concurrently with test generation
|
||||
5. **Verification** (`verification/`): Run candidates against tests via custom pytest plugin, compare outputs
|
||||
6. **Benchmarking** (`benchmarking/`): Measure performance, select best candidate by speedup
|
||||
7. **Result** (`result/`, `github/`): Create PR with winning optimization
|
||||
|
||||
## Key Entry Points
|
||||
|
||||
| Task | File |
|
||||
|------|------|
|
||||
| CLI arguments & commands | `cli_cmds/cli.py` |
|
||||
| Optimization orchestration | `optimization/optimizer.py` → `Optimizer.run()` |
|
||||
| Per-function optimization | `optimization/function_optimizer.py` → `FunctionOptimizer` |
|
||||
| Function discovery | `discovery/functions_to_optimize.py` |
|
||||
| Context extraction | `context/code_context_extractor.py` |
|
||||
| Test execution | `verification/test_runner.py`, `verification/pytest_plugin.py` |
|
||||
| Performance ranking | `benchmarking/function_ranker.py` |
|
||||
| Domain types | `models/models.py`, `models/function_types.py` |
|
||||
| AI service | `api/aiservice.py` → `AiServiceClient` |
|
||||
| Configuration | `code_utils/config_consts.py` |
|
||||
|
||||
## Documentation Pages
|
||||
|
||||
- [Domain Types](domain-types.md) — Core data types and their relationships
|
||||
- [Optimization Pipeline](optimization-pipeline.md) — Step-by-step data flow through the pipeline
|
||||
- [Context Extraction](context-extraction.md) — How code context is extracted and token-limited
|
||||
- [Verification](verification.md) — Test execution, pytest plugin, deterministic patches
|
||||
- [AI Service](ai-service.md) — AI service client endpoints and request types
|
||||
- [Configuration](configuration.md) — Config schema, effort levels, thresholds
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
# Optimization Pipeline
|
||||
|
||||
Step-by-step data flow from function discovery to PR creation.
|
||||
|
||||
## 1. Entry Point: `Optimizer.run()` (`optimization/optimizer.py`)
|
||||
|
||||
The `Optimizer` class is initialized with CLI args and creates:
|
||||
- `TestConfig` with test roots, project root, pytest command
|
||||
- `AiServiceClient` for AI service communication
|
||||
- Optional `LocalAiServiceClient` for experiments
|
||||
|
||||
`run()` orchestrates the full pipeline: discovers functions, optionally ranks them, then optimizes each in turn.
|
||||
|
||||
## 2. Function Discovery (`discovery/functions_to_optimize.py`)
|
||||
|
||||
`FunctionVisitor` traverses source files to find optimizable functions, producing `FunctionToOptimize` instances. Filters include:
|
||||
- Skipping functions that are too small or trivial
|
||||
- Skipping previously optimized functions (via `was_function_previously_optimized()`)
|
||||
- Applying user-configured include/exclude patterns
|
||||
|
||||
## 3. Function Ranking (`benchmarking/function_ranker.py`)
|
||||
|
||||
When trace data is available, `FunctionRanker` ranks functions by **addressable time** — the time a function spends that could be optimized (own time + callee time / call count). Functions below `DEFAULT_IMPORTANCE_THRESHOLD=0.001` are skipped.
|
||||
|
||||
## 4. Per-Function Optimization: `FunctionOptimizer` (`optimization/function_optimizer.py`)
|
||||
|
||||
For each function, `FunctionOptimizer.optimize_function()` runs the full optimization loop:
|
||||
|
||||
### 4a. Context Extraction (`context/code_context_extractor.py`)
|
||||
|
||||
Extracts `CodeOptimizationContext` containing:
|
||||
- `read_writable_code` — Code the LLM can modify (the function + helpers)
|
||||
- `read_only_context_code` — Dependency code for reference only
|
||||
- `testgen_context` — Context for test generation (may include imported class definitions)
|
||||
|
||||
Token limits are enforced: `OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000` and `TESTGEN_CONTEXT_TOKEN_LIMIT=16000`. Functions exceeding these are rejected.
|
||||
|
||||
### 4b. Concurrent Test Generation + LLM Optimization
|
||||
|
||||
These run in parallel using `concurrent.futures`:
|
||||
- **Test generation**: Generates regression tests from the function context
|
||||
- **LLM optimization**: Sends `read_writable_code.markdown` + `read_only_context_code` to the AI service
|
||||
|
||||
The number of candidates depends on effort level (see Configuration docs).
|
||||
|
||||
### 4c. Candidate Evaluation
|
||||
|
||||
For each `OptimizedCandidate`:
|
||||
|
||||
1. **Deduplication**: Normalize code AST and check against `CandidateEvaluationContext.ast_code_to_id`. If duplicate, copy results from previous evaluation.
|
||||
|
||||
2. **Code replacement**: Replace the original function with the candidate using `replace_function_definitions_in_module()`.
|
||||
|
||||
3. **Behavioral testing**: Run instrumented tests in subprocess. The custom pytest plugin applies deterministic patches. Compare return values, stdout, and pass/fail status against the original baseline.
|
||||
|
||||
4. **Benchmarking**: If behavior matches, run performance tests with looping (`TOTAL_LOOPING_TIME=10s`). Calculate speedup ratio.
|
||||
|
||||
5. **Validation**: Candidate must beat `MIN_IMPROVEMENT_THRESHOLD=0.05` (5% speedup) and pass stability checks.
|
||||
|
||||
### 4d. Refinement & Repair
|
||||
|
||||
- **Repair**: If fewer than `MIN_CORRECT_CANDIDATES=2` pass, failed candidates can be repaired via `AIServiceCodeRepairRequest` (sends test diffs to LLM).
|
||||
- **Refinement**: Top valid candidates are refined via `AIServiceRefinerRequest` (sends runtime data, line profiler results).
|
||||
- **Adaptive**: At HIGH effort, additional adaptive optimization rounds via `AIServiceAdaptiveOptimizeRequest`.
|
||||
|
||||
### 4e. Best Candidate Selection
|
||||
|
||||
The winning candidate is selected by:
|
||||
1. Highest speedup ratio
|
||||
2. For tied speedups, shortest diff length from original
|
||||
3. Refinement candidates use weighted ranking: `(2 * runtime_rank + 1 * diff_rank)`
|
||||
|
||||
Result is a `BestOptimization` with the candidate, context, test results, and runtime.
|
||||
|
||||
## 5. PR Creation (`github/`)
|
||||
|
||||
If a winning candidate is found, a PR is created with:
|
||||
- The optimized code diff
|
||||
- Performance benchmark details
|
||||
- Explanation from the LLM
|
||||
|
||||
## Worktree Mode
|
||||
|
||||
When `--worktree` is enabled, optimization runs in an isolated git worktree (`code_utils/git_worktree_utils.py`). This allows parallel optimization without affecting the working tree. Changes are captured as patch files.
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
# Verification
|
||||
|
||||
How codeflash verifies candidate correctness and measures performance.
|
||||
|
||||
## Test Execution Architecture
|
||||
|
||||
Tests are executed in a **subprocess** to isolate the test environment from the main codeflash process. The test runner (`verification/test_runner.py`) invokes pytest (or Jest for JS/TS) with specific plugin configurations.
|
||||
|
||||
### Plugin Blocklists
|
||||
|
||||
- **Behavioral tests**: Block `benchmark`, `codspeed`, `xdist`, `sugar`
|
||||
- **Benchmarking tests**: Block `codspeed`, `cov`, `benchmark`, `profiling`, `xdist`, `sugar`
|
||||
|
||||
These are defined as `BEHAVIORAL_BLOCKLISTED_PLUGINS` and `BENCHMARKING_BLOCKLISTED_PLUGINS` in `verification/test_runner.py`.
|
||||
|
||||
## Custom Pytest Plugin (`verification/pytest_plugin.py`)
|
||||
|
||||
The plugin is loaded into the test subprocess and provides:
|
||||
|
||||
### Deterministic Patches
|
||||
|
||||
`_apply_deterministic_patches()` replaces non-deterministic functions with fixed values to ensure reproducible test output:
|
||||
|
||||
| Module | Function | Fixed Value |
|
||||
|--------|----------|-------------|
|
||||
| `time` | `time()` | `1761717605.108106` |
|
||||
| `time` | `perf_counter()` | Incrementing by 1ms per call |
|
||||
| `datetime` | `datetime.now()` | `2021-01-01 02:05:10 UTC` |
|
||||
| `datetime` | `datetime.utcnow()` | `2021-01-01 02:05:10 UTC` |
|
||||
| `uuid` | `uuid4()` / `uuid1()` | `12345678-1234-5678-9abc-123456789012` |
|
||||
| `random` | `random()` | `0.123456789` (seeded with 42) |
|
||||
| `os` | `urandom(n)` | `b"\x42" * n` |
|
||||
| `numpy.random` | seed | `42` |
|
||||
|
||||
Patches call the original function first to maintain performance characteristics (same call overhead).
|
||||
|
||||
### Timing Markers
|
||||
|
||||
Test results include timing markers in stdout: `!######<id>:<duration_ns>######!`
|
||||
|
||||
The pattern `_TIMING_MARKER_PATTERN` extracts timing data for calculating function utilization fraction.
|
||||
|
||||
### Loop Stability
|
||||
|
||||
Performance benchmarking uses configurable stability thresholds:
|
||||
- `STABILITY_WINDOW_SIZE = 0.35` (35% of total iterations)
|
||||
- `STABILITY_CENTER_TOLERANCE = 0.0025` (±0.25% around median)
|
||||
- `STABILITY_SPREAD_TOLERANCE = 0.0025` (0.25% window spread)
|
||||
|
||||
### Memory Limits (Linux)
|
||||
|
||||
On Linux, the plugin sets `RLIMIT_AS` to 85% of total system memory (RAM + swap) to prevent OOM kills.
|
||||
|
||||
## Test Result Processing
|
||||
|
||||
### `TestResults` (`models/models.py`)
|
||||
|
||||
Collects `FunctionTestInvocation` results with:
|
||||
- Deduplicated insertion via `unique_invocation_loop_id`
|
||||
- `total_passed_runtime()` — Sum of minimum runtimes per test case (nanoseconds)
|
||||
- `number_of_loops()` — Max loop index
|
||||
- `usable_runtime_data_by_test_case()` — Grouped timing data
|
||||
|
||||
### `FunctionTestInvocation`
|
||||
|
||||
Each invocation records:
|
||||
- `loop_index` — Iteration number (starts at 1)
|
||||
- `id: InvocationId` — Fully qualified test identifier
|
||||
- `did_pass: bool` — Pass/fail status
|
||||
- `runtime: Optional[int]` — Time in nanoseconds
|
||||
- `return_value: Optional[object]` — Captured return value
|
||||
- `test_type: TestType` — Which test category
|
||||
|
||||
### Behavioral vs Performance Testing
|
||||
|
||||
1. **Behavioral**: Runs with `TestingMode.BEHAVIOR`. Compares return values and stdout between original and candidate. Any difference causes the candidate to be rejected.
|
||||
2. **Performance**: Runs with `TestingMode.PERFORMANCE`. Loops for `TOTAL_LOOPING_TIME=10s` to get stable timing. Calculates speedup ratio.
|
||||
3. **Line Profile**: Runs with `TestingMode.LINE_PROFILE`. Collects per-line timing data for refinement.
|
||||
|
||||
## Test Types
|
||||
|
||||
| TestType | Value | Description |
|
||||
|----------|-------|-------------|
|
||||
| `EXISTING_UNIT_TEST` | 1 | Pre-existing tests from the codebase |
|
||||
| `INSPIRED_REGRESSION` | 2 | Tests inspired by existing tests |
|
||||
| `GENERATED_REGRESSION` | 3 | AI-generated regression tests |
|
||||
| `REPLAY_TEST` | 4 | Tests from recorded benchmark data |
|
||||
| `CONCOLIC_COVERAGE_TEST` | 5 | Coverage-guided tests |
|
||||
| `INIT_STATE_TEST` | 6 | Class init state verification |
|
||||
|
||||
## Coverage
|
||||
|
||||
Coverage is measured via `CoverageData` with a threshold of `COVERAGE_THRESHOLD=60.0%`. Low coverage may affect confidence in the optimization's correctness.
|
||||
|
|
@ -1,118 +0,0 @@
|
|||
{
|
||||
"package_name": "codeflash-docs",
|
||||
"total_capabilities": 16,
|
||||
"capabilities": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "pipeline-stage-ordering",
|
||||
"description": "Know the correct ordering of codeflash pipeline stages: Discovery → Ranking → Context Extraction → Test Gen + Optimization (concurrent) → Baseline → Candidate Evaluation → PR",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["Optimizer.run()", "FunctionOptimizer.optimize_function()"]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "function-to-optimize-fields",
|
||||
"description": "Know FunctionToOptimize key fields (function_name, file_path, parents, starting_line/ending_line, is_async, is_method, language) and properties (qualified_name, top_level_parent_name, class_name)",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["FunctionToOptimize", "FunctionParent", "models/function_types.py"]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"name": "code-strings-markdown-format",
|
||||
"description": "Know that code is serialized as markdown fenced blocks with language:filepath syntax (```python:filepath\\ncode\\n```) and parsed via CodeStringsMarkdown.parse_markdown_code()",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["CodeStringsMarkdown", "CodeString", ".markdown", ".flat", "parse_markdown_code()"]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "read-writable-vs-read-only",
|
||||
"description": "Distinguish read_writable_code (LLM can modify) from read_only_context_code (reference only) in CodeOptimizationContext",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["CodeOptimizationContext", "read_writable_code", "read_only_context_code"]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"name": "candidate-source-types",
|
||||
"description": "Know OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE and when each is used",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["OptimizedCandidateSource", "OptimizedCandidate"]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"name": "candidate-forest-dag",
|
||||
"description": "Know that candidates form a forest/DAG via parent_id references where refinements and repairs build on previous candidates",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["parent_id", "OptimizedCandidate", "CandidateForest"]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"name": "concurrent-testgen-optimization",
|
||||
"description": "Know that test generation and LLM optimization run concurrently using concurrent.futures, not sequentially",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["concurrent.futures", "FunctionOptimizer.optimize_function()"]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"name": "deterministic-patch-values",
|
||||
"description": "Know the specific fixed values used by deterministic patches: time=1761717605.108106, datetime=2021-01-01 02:05:10 UTC, uuid=12345678-1234-5678-9abc-123456789012, random seeded with 42",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["_apply_deterministic_patches()", "pytest_plugin.py"]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"name": "test-type-enum",
|
||||
"description": "Know the 6 TestType variants: EXISTING_UNIT_TEST, INSPIRED_REGRESSION, GENERATED_REGRESSION, REPLAY_TEST, CONCOLIC_COVERAGE_TEST, INIT_STATE_TEST",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["TestType", "models/test_type.py"]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"name": "ai-service-endpoints",
|
||||
"description": "Know the AI service endpoints: /ai/optimize, /ai/optimize_line_profiler, /ai/refine, /ai/repair, /ai/adaptive_optimize, /ai/rewrite_jit",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["AiServiceClient", "api/aiservice.py"]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"name": "repair-request-structure",
|
||||
"description": "Know that AIServiceCodeRepairRequest includes TestDiff objects with scope (RETURN_VALUE/STDOUT/DID_PASS), original vs candidate values, and test source code",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["AIServiceCodeRepairRequest", "TestDiff", "TestDiffScope"]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"name": "effort-level-values",
|
||||
"description": "Know specific effort level values: LOW gets 3 candidates, MEDIUM gets 5, HIGH gets 6 (N_OPTIMIZER_CANDIDATES)",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["EffortLevel", "N_OPTIMIZER_CANDIDATES", "EFFORT_VALUES"]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"name": "context-token-limits",
|
||||
"description": "Know OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000 and TESTGEN_CONTEXT_TOKEN_LIMIT=16000 and that encoded_tokens_len() is used for counting",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"name": "best-candidate-selection",
|
||||
"description": "Know the selection criteria: highest speedup, then shortest diff for ties, and refinement weighted ranking (2*runtime + 1*diff)",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["BestOptimization", "REFINED_CANDIDATE_RANKING_WEIGHTS"]
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"name": "plugin-blocklists",
|
||||
"description": "Know behavioral test blocklisted plugins (benchmark, codspeed, xdist, sugar) and benchmarking blocklist (adds cov, profiling)",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["BEHAVIORAL_BLOCKLISTED_PLUGINS", "BENCHMARKING_BLOCKLISTED_PLUGINS"]
|
||||
},
|
||||
{
|
||||
"id": 15,
|
||||
"name": "result-type-usage",
|
||||
"description": "Know that Result[L,R] from either.py uses Success(value)/Failure(error) with is_successful() check before unwrap()",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["Result", "Success", "Failure", "is_successful", "either.py"]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1 +0,0 @@
|
|||
Code serialization format and context splitting
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent knows the CodeStringsMarkdown serialization format and the distinction between read-writable and read-only code context in the codeflash pipeline.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Markdown code block format",
|
||||
"description": "Uses the correct fenced code block format with language:filepath syntax (```python:path/to/file.py) when constructing code for the AI service, NOT plain code blocks without file paths",
|
||||
"max_score": 30
|
||||
},
|
||||
{
|
||||
"name": "Read-writable vs read-only split",
|
||||
"description": "Correctly separates code into read_writable_code (code the LLM can modify) and read_only_context_code (reference-only dependency code), NOT treating all code as modifiable",
|
||||
"max_score": 35
|
||||
},
|
||||
{
|
||||
"name": "parse_markdown_code usage",
|
||||
"description": "Uses CodeStringsMarkdown.parse_markdown_code() to parse AI service responses back into structured code, NOT manual string splitting or regex",
|
||||
"max_score": 35
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
# Format Code for AI Service Request
|
||||
|
||||
## Context
|
||||
|
||||
You are working on the codeflash optimization engine. The AI service accepts optimization requests with source code and dependency context. A function `calculate_total` in `analytics/metrics.py` needs to be optimized. It calls a helper `normalize_values` in the same file (both modifiable), and imports `BaseMetric` from `analytics/base.py` (not modifiable, just for reference).
|
||||
|
||||
```python
|
||||
# analytics/metrics.py
|
||||
from analytics.base import BaseMetric
|
||||
|
||||
def normalize_values(data: list[float]) -> list[float]:
|
||||
max_val = max(data)
|
||||
return [x / max_val for x in data]
|
||||
|
||||
def calculate_total(metrics: list[BaseMetric]) -> float:
|
||||
values = [m.value for m in metrics]
|
||||
normalized = normalize_values(values)
|
||||
return sum(normalized)
|
||||
```
|
||||
|
||||
```python
|
||||
# analytics/base.py
|
||||
class BaseMetric:
|
||||
def __init__(self, name: str, value: float):
|
||||
self.name = name
|
||||
self.value = value
|
||||
```
|
||||
|
||||
## Task
|
||||
|
||||
Write a Python function `prepare_optimization_payload` that constructs the code payload for an AI service optimization request for `calculate_total`. It should properly format the source code and dependency code, and include a function to parse the AI service response back into structured code objects.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- A Python file `payload_builder.py` with the payload construction and response parsing logic
|
||||
|
|
@ -1 +0,0 @@
|
|||
Candidate source types and DAG relationships
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Lists source types",
|
||||
"description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Parent ID linkage",
|
||||
"description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Refinement uses runtime data",
|
||||
"description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Repair uses test diffs",
|
||||
"description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Document the Candidate Lifecycle
|
||||
|
||||
## Context
|
||||
|
||||
A new engineer is joining the codeflash team and needs to understand how optimization candidates are generated, improved, and related to each other throughout the pipeline. They've asked for a clear explanation of the different ways candidates are produced and how the system iterates on them.
|
||||
|
||||
## Task
|
||||
|
||||
Write a technical document explaining the full lifecycle of an optimization candidate in codeflash — from initial generation through improvement iterations. Cover all the different ways candidates can be created, what data is sent to the AI service for each type, and how candidates relate to each other structurally.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- A markdown file `candidate-lifecycle.md`
|
||||
|
|
@ -1 +0,0 @@
|
|||
Deterministic patch values and test execution architecture
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent knows the specific deterministic patch values used in codeflash's pytest plugin and the subprocess-based test execution architecture.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Subprocess isolation",
|
||||
"description": "States that tests run in a subprocess to isolate the test environment from the main codeflash process, NOT in the same process",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Fixed time value",
|
||||
"description": "References the specific fixed timestamp 1761717605.108106 for time.time() or the fixed datetime 2021-01-01 02:05:10 UTC for datetime.now()",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Fixed UUID value",
|
||||
"description": "References the specific fixed UUID 12345678-1234-5678-9abc-123456789012 for uuid4/uuid1",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Random seed",
|
||||
"description": "States that random is seeded with 42 (NOT a different seed value)",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Plugin blocklists",
|
||||
"description": "Mentions that behavioral tests block specific pytest plugins (at least 2 of: benchmark, codspeed, xdist, sugar) to ensure deterministic execution",
|
||||
"max_score": 20
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Explain Test Reproducibility Guarantees
|
||||
|
||||
## Context
|
||||
|
||||
A codeflash user notices that their optimization candidate passes behavioral tests on one run but fails on the next. They suspect non-determinism in the test execution. They want to understand what guarantees codeflash provides for test reproducibility and how the system ensures consistent results.
|
||||
|
||||
## Task
|
||||
|
||||
Write a technical explanation of how codeflash ensures deterministic test execution. Cover the execution environment setup, what sources of non-determinism are controlled, and any specific values or configurations used. Also explain the test execution architecture.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- A markdown file `test-reproducibility.md`
|
||||
|
|
@ -1 +0,0 @@
|
|||
Effort level configuration and candidate selection criteria
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent knows the specific effort level values for candidate generation and the criteria used to select the best optimization candidate.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Candidate counts by effort",
|
||||
"description": "States correct N_OPTIMIZER_CANDIDATES values: LOW=3, MEDIUM=5, HIGH=6 (at least 2 of 3 correct)",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Speedup as primary selector",
|
||||
"description": "States that the winning candidate is selected primarily by highest speedup ratio",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Diff length as tiebreaker",
|
||||
"description": "States that for tied speedups, shortest diff length from original is used as tiebreaker",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Refinement ranking weights",
|
||||
"description": "States that refinement candidates use weighted ranking with runtime weighted more heavily than diff (2:1 ratio or REFINED_CANDIDATE_RANKING_WEIGHTS=(2,1))",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,18 +0,0 @@
|
|||
# Design a Candidate Selection Dashboard
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash team wants to build a dashboard that shows users how optimization candidates were evaluated and why a particular candidate won. The dashboard needs to display the selection process at each stage, from initial candidate pool through to the final winner.
|
||||
|
||||
## Task
|
||||
|
||||
Write a specification document for the dashboard that explains:
|
||||
1. How many candidates are generated at each effort level
|
||||
2. The exact criteria and order of operations used to pick the winning candidate
|
||||
3. How refinement candidates are ranked differently from initial candidates
|
||||
|
||||
Include concrete examples showing how two hypothetical candidates would be compared.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- A markdown file `selection-dashboard-spec.md`
|
||||
|
|
@ -1 +0,0 @@
|
|||
Pipeline concurrency and FunctionToOptimize structure
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent knows the FunctionToOptimize data structure and the concurrent execution model for test generation and optimization.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "FunctionToOptimize fields",
|
||||
"description": "Includes at least 4 of: function_name, file_path, parents (list of FunctionParent), starting_line, ending_line, is_async, is_method, language",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Qualified name property",
|
||||
"description": "Mentions qualified_name as a property that produces the full dotted name including parent classes (e.g., MyClass.my_method)",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Concurrent execution",
|
||||
"description": "States that test generation and LLM optimization run concurrently (in parallel), NOT sequentially one after the other",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Entry point identification",
|
||||
"description": "Correctly identifies Optimizer.run() as the top-level entry point and FunctionOptimizer.optimize_function() as the per-function entry point",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
# Implement a Function Optimization Status Tracker
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash team needs a status tracker that logs what happens to each function during an optimization run. For each function, it should record the function identity, which pipeline stages it passed through, and how long each stage took.
|
||||
|
||||
## Task
|
||||
|
||||
Write a design document explaining:
|
||||
1. What data structure represents a function being optimized, including its identity fields and how nested functions (methods inside classes) are represented
|
||||
2. The full name resolution strategy for identifying functions uniquely
|
||||
3. Which stages of the pipeline operate on a single function at a time vs. operating on multiple functions
|
||||
4. Where in the codebase the per-function optimization is orchestrated and what the top-level entry point is
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- A markdown file `status-tracker-design.md`
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
{
|
||||
"total_scenarios": 5,
|
||||
"capabilities_coverage": {
|
||||
"total_capabilities": 16,
|
||||
"capabilities_tested": 12,
|
||||
"coverage_percentage": 75.0
|
||||
},
|
||||
"complexity_distribution": {
|
||||
"basic": 1,
|
||||
"intermediate": 3,
|
||||
"advanced": 1
|
||||
},
|
||||
"scenarios": [
|
||||
{
|
||||
"index": 1,
|
||||
"capability": "code-strings-markdown-format, read-writable-vs-read-only",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"capability": "candidate-source-types, candidate-forest-dag, repair-request-structure",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"capability": "deterministic-patch-values, plugin-blocklists",
|
||||
"complexity": "advanced"
|
||||
},
|
||||
{
|
||||
"index": 4,
|
||||
"capability": "effort-level-values, best-candidate-selection",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 5,
|
||||
"capability": "function-to-optimize-fields, concurrent-testgen-optimization, pipeline-stage-ordering",
|
||||
"complexity": "basic"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
{
|
||||
"total_infeasible": 4,
|
||||
"infeasible_capabilities": [
|
||||
{
|
||||
"capability": "ai-service-endpoints",
|
||||
"complexity": "intermediate",
|
||||
"reasoning": "Testing knowledge of specific API endpoints requires actual HTTP requests or mocking that bypasses the capability being tested"
|
||||
},
|
||||
{
|
||||
"capability": "context-token-limits",
|
||||
"complexity": "basic",
|
||||
"reasoning": "Already covered by the skills tile eval (scenario-1). Testing token counting requires the actual tokenizer library"
|
||||
},
|
||||
{
|
||||
"capability": "test-type-enum",
|
||||
"complexity": "basic",
|
||||
"reasoning": "Simple enum knowledge is better verified through skills that use test types rather than isolated recall"
|
||||
},
|
||||
{
|
||||
"capability": "result-type-usage",
|
||||
"complexity": "basic",
|
||||
"reasoning": "Already covered by the skills tile eval (scenario-2). Testing Result type usage is better done through implementation tasks"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
{
|
||||
"name": "codeflash/codeflash-docs",
|
||||
"version": "0.1.0",
|
||||
"summary": "Internal documentation for the codeflash optimization engine",
|
||||
"private": true,
|
||||
"docs": "docs/index.md"
|
||||
}
|
||||
|
|
@ -1,45 +0,0 @@
|
|||
# Architecture
|
||||
|
||||
```
|
||||
codeflash/
|
||||
├── main.py # CLI entry point
|
||||
├── cli_cmds/ # Command handling, console output (Rich)
|
||||
├── discovery/ # Find optimizable functions
|
||||
├── context/ # Extract code dependencies and imports
|
||||
├── optimization/ # Generate optimized code via AI
|
||||
│ ├── optimizer.py # Main optimization orchestration
|
||||
│ └── function_optimizer.py # Per-function optimization logic
|
||||
├── verification/ # Run deterministic tests (pytest plugin)
|
||||
├── benchmarking/ # Performance measurement
|
||||
├── github/ # PR creation
|
||||
├── api/ # AI service communication
|
||||
├── code_utils/ # Code parsing, git utilities
|
||||
├── models/ # Pydantic models and types
|
||||
├── languages/ # Multi-language support (Python, JavaScript/TypeScript)
|
||||
├── setup/ # Config schema, auto-detection, first-run experience
|
||||
├── picklepatch/ # Serialization/deserialization utilities
|
||||
├── tracing/ # Function call tracing
|
||||
├── tracer.py # Root-level tracer entry point for profiling
|
||||
├── lsp/ # IDE integration (Language Server Protocol)
|
||||
├── telemetry/ # Sentry, PostHog
|
||||
├── either.py # Functional Result type for error handling
|
||||
├── result/ # Result types and handling
|
||||
└── version.py # Version information
|
||||
```
|
||||
|
||||
## Key Entry Points
|
||||
|
||||
| Task | Start here |
|
||||
|------|------------|
|
||||
| CLI arguments & commands | `cli_cmds/cli.py` |
|
||||
| Optimization orchestration | `optimization/optimizer.py` → `Optimizer.run()` |
|
||||
| Per-function optimization | `optimization/function_optimizer.py` → `FunctionOptimizer` |
|
||||
| Function discovery | `discovery/functions_to_optimize.py` |
|
||||
| Context extraction | `context/code_context_extractor.py` |
|
||||
| Test execution | `verification/test_runner.py`, `verification/pytest_plugin.py` |
|
||||
| Performance ranking | `benchmarking/function_ranker.py` |
|
||||
| Domain types | `models/models.py`, `models/function_types.py` |
|
||||
| Result handling | `either.py` (`Result`, `Success`, `Failure`, `is_successful`) |
|
||||
| AI service communication | `api/aiservice.py` → `AiServiceClient` |
|
||||
| Configuration constants | `code_utils/config_consts.py` |
|
||||
| Language support | `languages/registry.py` → `get_language_support()` |
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
# Code Style
|
||||
|
||||
- **Line length**: 120 characters
|
||||
- **Python**: 3.9+ syntax (use `from __future__ import annotations` for type hints)
|
||||
- **Package management**: Always use `uv`, never `pip` — run commands via `uv run`
|
||||
- **Tooling**: Ruff for linting/formatting, mypy strict mode, prek for pre-commit checks (`uv run prek run`)
|
||||
- **Comments**: Minimal — only explain "why", not "what"
|
||||
- **Docstrings**: Do not add unless explicitly requested
|
||||
- **Naming**: NEVER use leading underscores (`_function_name`) — Python has no true private functions, use public names
|
||||
- **Paths**: Always use absolute `Path` objects, handle encoding explicitly (UTF-8)
|
||||
- **Source transforms**: Use `libcst` for code modification/transformation to preserve formatting; `ast` is acceptable for read-only analysis and parsing
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
# Git Conventions
|
||||
|
||||
- **Always create a new branch from `main`** — never commit directly to `main` or reuse an existing feature branch for unrelated changes
|
||||
- Use conventional commit format: `fix:`, `feat:`, `refactor:`, `docs:`, `test:`, `chore:`
|
||||
- Keep commits atomic — one logical change per commit
|
||||
- Commit message body should be concise (1-2 sentences max)
|
||||
- PR titles should also use conventional format
|
||||
- Branch naming: `cf-#-title` (lowercase, hyphenated) where `#` is the Linear issue number
|
||||
- If related to a Linear issue, include `CF-#` in the PR body
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
# Language Support Rules
|
||||
|
||||
- Current language is a module-level singleton in `languages/current.py` — use `set_current_language()` / `current_language()`, never pass language as a parameter through call chains
|
||||
- Use `get_language_support(identifier)` from `languages/registry.py` to get a `LanguageSupport` instance — accepts `Path`, `Language` enum, or string; never import language classes directly
|
||||
- New language support classes must use the `@register_language` decorator to register with the extension and language registries
|
||||
- `languages/__init__.py` uses `__getattr__` for lazy imports to avoid circular dependencies — follow this pattern when adding new exports
|
||||
- `is_javascript()` returns `True` for both JavaScript and TypeScript
|
||||
- Language modules are lazily imported on first `get_language_support()` call via `_ensure_languages_registered()` — the `@register_language` decorator fires on import and populates `_EXTENSION_REGISTRY` and `_LANGUAGE_REGISTRY`
|
||||
- `LanguageSupport` instances are cached in `_SUPPORT_CACHE` — use `clear_cache()` only in tests
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
# Optimization Pipeline Patterns
|
||||
|
||||
- All major operations return `Result[SuccessType, ErrorType]` — construct with `Success(value)` / `Failure(error)`, check with `is_successful()` before calling `unwrap()`
|
||||
- Code context has token limits (`OPTIMIZATION_CONTEXT_TOKEN_LIMIT=16000`, `TESTGEN_CONTEXT_TOKEN_LIMIT=16000` in `code_utils/config_consts.py`) — exceeding them rejects the function
|
||||
- `read_writable_code` (modifiable code) can span multiple files; `read_only_context_code` is reference-only dependency code
|
||||
- Code is serialized as markdown code blocks: `` ```language:filepath\ncode\n``` `` — see `CodeStringsMarkdown` in `models/models.py`
|
||||
- Candidates form a forest (DAG): refinements/repairs reference `parent_id` on previous candidates via `OptimizedCandidateSource` (OPTIMIZE, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE)
|
||||
- Test generation and optimization run concurrently — coordinate through `CandidateEvaluationContext`
|
||||
- Generated tests are instrumented with `codeflash_capture.py` to record return values and traces
|
||||
- Minimum improvement threshold is 5% (`MIN_IMPROVEMENT_THRESHOLD=0.05`) — candidates below this are rejected
|
||||
- Stability thresholds: `STABILITY_WINDOW_SIZE=0.35`, `STABILITY_CENTER_TOLERANCE=0.0025`, `STABILITY_SPREAD_TOLERANCE=0.0025`
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Testing Rules
|
||||
|
||||
- Code context extraction and replacement tests must assert full string equality — no substring matching
|
||||
- Use pytest's `tmp_path` fixture for temp directories (it's a `Path` object)
|
||||
- Write temp files inside `tmp_path`, never use `NamedTemporaryFile` (causes Windows file contention)
|
||||
- Always call `.resolve()` on Path objects to ensure absolute paths and resolve symlinks
|
||||
- Use `.as_posix()` when converting resolved paths to strings (normalizes to forward slashes)
|
||||
- Any new feature or bug fix that can be tested automatically must have test cases
|
||||
- If changes affect existing test expectations, update the tests accordingly — tests must always pass after changes
|
||||
- The pytest plugin patches `time`, `random`, `uuid`, `datetime`, `os.urandom`, and `numpy.random` for deterministic test execution — never assume real randomness or real time in verification tests
|
||||
- `conftest.py` uses an autouse fixture that calls `reset_current_language()` — tests always start with Python as the default language
|
||||
- Test types are defined by the `TestType` enum: `EXISTING_UNIT_TEST`, `INSPIRED_REGRESSION`, `GENERATED_REGRESSION`, `REPLAY_TEST`, `CONCOLIC_COVERAGE_TEST`, `INIT_STATE_TEST`
|
||||
- Verification runs tests in a subprocess using a custom pytest plugin (`verification/pytest_plugin.py`) — behavioral tests use blocklisted plugins (`benchmark`, `codspeed`, `xdist`, `sugar`), benchmarking tests additionally block `cov` and `profiling`
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"name": "codeflash/codeflash-rules",
|
||||
"version": "0.1.0",
|
||||
"summary": "Coding standards and conventions for the codeflash codebase",
|
||||
"private": true,
|
||||
"rules": {
|
||||
"code-style": {
|
||||
"rules": "rules/code-style.md"
|
||||
},
|
||||
"architecture": {
|
||||
"rules": "rules/architecture.md"
|
||||
},
|
||||
"optimization-patterns": {
|
||||
"rules": "rules/optimization-patterns.md"
|
||||
},
|
||||
"git-conventions": {
|
||||
"rules": "rules/git-conventions.md"
|
||||
},
|
||||
"testing-rules": {
|
||||
"rules": "rules/testing-rules.md"
|
||||
},
|
||||
"language-rules": {
|
||||
"rules": "rules/language-rules.md"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,104 +0,0 @@
|
|||
{
|
||||
"package_name": "codeflash-skills",
|
||||
"total_capabilities": 14,
|
||||
"capabilities": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "sequential-pipeline-debugging",
|
||||
"description": "Debug optimization failures by walking through pipeline stages sequentially and stopping at the first failure found",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["discovery", "ranking", "context", "AI service", "verification", "deduplication", "repair"]
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"name": "token-limit-awareness",
|
||||
"description": "Know that OPTIMIZATION_CONTEXT_TOKEN_LIMIT and TESTGEN_CONTEXT_TOKEN_LIMIT are both 16000 tokens and that exceeding them causes function rejection",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["OPTIMIZATION_CONTEXT_TOKEN_LIMIT", "TESTGEN_CONTEXT_TOKEN_LIMIT", "encoded_tokens_len()"]
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"name": "improvement-threshold",
|
||||
"description": "Know that MIN_IMPROVEMENT_THRESHOLD is 0.05 (5%) and candidates below this speedup are rejected",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["MIN_IMPROVEMENT_THRESHOLD", "STABILITY_WINDOW_SIZE"]
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"name": "ast-deduplication",
|
||||
"description": "Know that candidates are deduplicated via AST normalization using normalize_code() and CandidateEvaluationContext.ast_code_to_id",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["normalize_code()", "CandidateEvaluationContext.ast_code_to_id", "code_utils/deduplicate_code.py"]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"name": "repair-trigger-conditions",
|
||||
"description": "Know that repair only triggers when fewer than MIN_CORRECT_CANDIDATES=2 pass, and is skipped when REPAIR_UNMATCHED_PERCENTAGE_LIMIT is exceeded",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["MIN_CORRECT_CANDIDATES", "REPAIR_UNMATCHED_PERCENTAGE_LIMIT", "AIServiceCodeRepairRequest"]
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"name": "ai-service-error-patterns",
|
||||
"description": "Know specific log patterns to search for when AI service fails: 'Error generating optimized candidates', 'cli-optimize-error-caught', 'cli-optimize-error-response'",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["AiServiceClient", "api/aiservice.py"]
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"name": "behavioral-vs-benchmark-failures",
|
||||
"description": "Distinguish between behavioral test failures (return value/stdout/pass-fail mismatches via TestDiffScope) and benchmark failures (speedup below threshold)",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["TestDiffScope", "RETURN_VALUE", "STDOUT", "DID_PASS"]
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"name": "result-type-pattern",
|
||||
"description": "Use Result[L, R] from either.py with Success/Failure constructors and is_successful() checks before unwrap()",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["Result", "Success", "Failure", "is_successful", "unwrap()", "either.py"]
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"name": "effort-config-pattern",
|
||||
"description": "Add effort-dependent config via EffortKeys enum, EFFORT_VALUES dict with LOW/MEDIUM/HIGH levels, and get_effort_value()",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["EffortKeys", "EffortLevel", "EFFORT_VALUES", "get_effort_value()", "config_consts.py"]
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"name": "module-to-feature-mapping",
|
||||
"description": "Know which codeflash module to modify for different feature types (optimization/ for strategies, api/ for endpoints, languages/ for language support, etc.)",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["MODULE_REFERENCE.md"]
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"name": "domain-type-conventions",
|
||||
"description": "Use @dataclass(frozen=True) for immutable data, BaseModel for serializable models, and keep function_types.py dependency-free",
|
||||
"complexity": "intermediate",
|
||||
"api_elements": ["@dataclass(frozen=True)", "BaseModel", "models/models.py", "models/function_types.py"]
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"name": "test-patterns",
|
||||
"description": "Use tmp_path fixture, .resolve() on Paths, .as_posix() for string conversion, full string equality assertions, and awareness of deterministic patches",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["tmp_path", ".resolve()", ".as_posix()", "pytest_plugin.py"]
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"name": "quality-check-commands",
|
||||
"description": "Run uv run prek run for formatting/linting, uv run mypy for type checking, and uv run pytest for tests",
|
||||
"complexity": "basic",
|
||||
"api_elements": ["uv run prek run", "uv run mypy", "uv run pytest"]
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"name": "language-support-patterns",
|
||||
"description": "Use @register_language decorator, get_language_support() for lookup, singleton pattern via set_current_language()/current_language(), and is_python()/is_javascript() guards",
|
||||
"complexity": "advanced",
|
||||
"api_elements": ["@register_language", "get_language_support()", "set_current_language()", "is_python()", "is_javascript()"]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1 +0,0 @@
|
|||
Sequential pipeline debugging with specific thresholds
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent follows the sequential debugging workflow from the skill, checking pipeline stages in order and using correct threshold values when diagnosing an optimization that produced no results.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Sequential stage order",
|
||||
"description": "Investigates pipeline stages in order: discovery before ranking before context before AI service before test failures. Does NOT jump to later stages without checking earlier ones first.",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Token limit value",
|
||||
"description": "References the specific token limit of 16000 for OPTIMIZATION_CONTEXT_TOKEN_LIMIT or TESTGEN_CONTEXT_TOKEN_LIMIT when checking context extraction",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Importance threshold",
|
||||
"description": "References DEFAULT_IMPORTANCE_THRESHOLD=0.001 when checking function ranking",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Stops at failure",
|
||||
"description": "Identifies the failing stage and focuses investigation there rather than continuing through all remaining stages",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Diagnose Silent Optimization Skip
|
||||
|
||||
## Context
|
||||
|
||||
A user reports that when running codeflash on their project, a specific function `calculate_metrics` in `analytics/processor.py` never appears in the optimization results. The function exists in the module root, is not in the exclude list, and has not been previously optimized. Trace data shows the function is called frequently but with very short execution times (averaging 0.0005 seconds total addressable time). The function has moderate dependencies.
|
||||
|
||||
## Task
|
||||
|
||||
Write a diagnostic report explaining why this function is being skipped and at which stage in the pipeline the function is filtered out. Include the specific threshold or condition that causes the skip.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
A markdown file `diagnostic-report.md` explaining the root cause.
|
||||
|
|
@ -1 +0,0 @@
|
|||
Result type pattern and effort-dependent configuration
|
||||
|
|
@ -1,31 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent uses the codeflash Result type pattern from either.py and the effort-dependent configuration pattern when implementing a new pipeline feature.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Imports from either.py",
|
||||
"description": "Imports Success, Failure, and is_successful from codeflash.either (NOT from a different error handling module)",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Result return type",
|
||||
"description": "Function returns Result type using Success() for success and Failure() for errors, not exceptions or None",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "is_successful check",
|
||||
"description": "Calls is_successful() or .is_successful() before calling unwrap() on the result",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "EffortKeys enum entry",
|
||||
"description": "Adds a new entry to the EffortKeys enum in config_consts.py",
|
||||
"max_score": 20
|
||||
},
|
||||
{
|
||||
"name": "Three effort levels",
|
||||
"description": "Adds values for all three EffortLevel variants (LOW, MEDIUM, HIGH) in EFFORT_VALUES dict",
|
||||
"max_score": 20
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
# Add Candidate Timeout Feature
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash optimization engine currently has no per-candidate timeout. Some candidates take too long during verification, wasting the optimization budget. A new feature is needed to skip candidates that exceed a configurable time limit during behavioral testing.
|
||||
|
||||
The timeout should vary based on the optimization effort setting — shorter timeouts for low effort runs (to save time) and longer for high effort runs (to allow more complex optimizations).
|
||||
|
||||
## Task
|
||||
|
||||
Implement a `check_candidate_timeout` function in `codeflash/optimization/function_optimizer.py` that:
|
||||
1. Takes a candidate runtime and returns whether the candidate should be skipped
|
||||
2. Uses a configurable timeout threshold that scales with optimization effort
|
||||
3. Handles the error case where the runtime measurement is unavailable
|
||||
|
||||
Also add the necessary configuration constant to `codeflash/code_utils/config_consts.py`.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- Modified `function_optimizer.py` with the new function
|
||||
- Modified `config_consts.py` with the new configuration
|
||||
|
|
@ -1 +0,0 @@
|
|||
Test patterns and deterministic patch awareness
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent follows codeflash test conventions when writing tests, including path handling, temp directory patterns, and awareness of the deterministic patching system.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Uses tmp_path fixture",
|
||||
"description": "Test function uses pytest tmp_path fixture parameter, NOT tempfile.NamedTemporaryFile or tempfile.mkdtemp",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Calls resolve on paths",
|
||||
"description": "Calls .resolve() on Path objects before using them in assertions or function calls",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Full string equality",
|
||||
"description": "Uses exact equality assertions (== or assert_equal) for code string comparisons, NOT substring checks like 'in' or assertIn or contains",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "No real time dependency",
|
||||
"description": "Test does NOT depend on real time.time(), datetime.now(), random values, or uuid generation for correctness. Acknowledges or accounts for deterministic patches if time/random values are involved.",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
# Write Tests for Context Hash Comparison
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash context extraction module has a function `compare_context_hashes(context_a, context_b)` that takes two `CodeOptimizationContext` objects and returns whether their hashing contexts are identical. This is used to detect when the same function has already been optimized.
|
||||
|
||||
```python
|
||||
# In codeflash/context/code_context_extractor.py
|
||||
def compare_context_hashes(context_a: CodeOptimizationContext, context_b: CodeOptimizationContext) -> bool:
|
||||
return context_a.hashing_code_context_hash == context_b.hashing_code_context_hash
|
||||
```
|
||||
|
||||
## Task
|
||||
|
||||
Write a test file `tests/test_context/test_hash_comparison.py` with tests for this function. Include tests for:
|
||||
1. Two contexts with identical code producing the same hash
|
||||
2. Two contexts with different code producing different hashes
|
||||
3. A context compared with itself
|
||||
|
||||
The tests should create temporary Python source files to build realistic context objects.
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- `tests/test_context/test_hash_comparison.py`
|
||||
|
|
@ -1 +0,0 @@
|
|||
Domain type conventions and module identification
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent follows codeflash domain type conventions and correctly identifies the right module when adding a new data type for the optimization pipeline.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "Placed in models/models.py",
|
||||
"description": "New data type is added to codeflash/models/models.py (NOT models/function_types.py, since it has dependencies on other codeflash modules)",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Uses frozen dataclass",
|
||||
"description": "Immutable data type uses @dataclass(frozen=True) decorator, NOT a regular class or unfrozen dataclass",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "BaseModel for serializable",
|
||||
"description": "If a serializable model is needed, uses Pydantic BaseModel (NOT dataclass or dict)",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Correct module for feature",
|
||||
"description": "Places the main logic in the correct module for the feature type (e.g., verification/ for test-related, optimization/ for candidate-related, api/ for service-related)",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
# Add Optimization Confidence Score
|
||||
|
||||
## Context
|
||||
|
||||
The codeflash team wants to add a confidence score to each optimization result. The score should capture how confident the system is that an optimization is both correct and beneficial. It combines test coverage percentage, number of passing test cases, and speedup stability into a single metric.
|
||||
|
||||
The score needs to be:
|
||||
- Attached to each candidate during evaluation (immutable once computed)
|
||||
- Included in the final PR report (needs JSON serialization)
|
||||
- Computed during the candidate evaluation phase
|
||||
|
||||
## Task
|
||||
|
||||
1. Define the data types needed for the confidence score
|
||||
2. Write a `compute_confidence_score` function that takes coverage percentage (float), passing test count (int), and stability ratio (float) and returns the confidence result
|
||||
3. Place all code in the appropriate codeflash modules
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
- New/modified type definitions in the appropriate models file
|
||||
- New function in the appropriate module
|
||||
|
|
@ -1 +0,0 @@
|
|||
Deduplication mechanics and repair trigger conditions
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"context": "Tests whether the agent understands codeflash's candidate deduplication via AST normalization and the specific conditions under which code repair is triggered vs skipped.",
|
||||
"type": "weighted_checklist",
|
||||
"checklist": [
|
||||
{
|
||||
"name": "AST normalization",
|
||||
"description": "Mentions that deduplication uses AST normalization (normalize_code from code_utils/deduplicate_code.py), NOT simple string comparison",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Duplicate result copying",
|
||||
"description": "Explains that duplicate candidates copy results from the first-seen candidate rather than being re-tested",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Repair trigger threshold",
|
||||
"description": "States that repair triggers when fewer than 2 candidates pass (MIN_CORRECT_CANDIDATES=2), NOT when zero candidates pass or when any candidate fails",
|
||||
"max_score": 25
|
||||
},
|
||||
{
|
||||
"name": "Unmatched percentage limit",
|
||||
"description": "Mentions REPAIR_UNMATCHED_PERCENTAGE_LIMIT as a condition that can cause repair to be skipped entirely, with effort-dependent values (0.2/0.3/0.4)",
|
||||
"max_score": 25
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
# Investigate Low Candidate Diversity
|
||||
|
||||
## Context
|
||||
|
||||
A codeflash user is optimizing a data processing function at medium effort level. The AI service returns 5 candidates, but the optimization log shows only 1 candidate was actually benchmarked. Of the 5 candidates, 1 passed behavioral tests but didn't meet the performance threshold. The user wants to understand what happened to the other 4 candidates and why no repair attempts were made.
|
||||
|
||||
## Task
|
||||
|
||||
Write an analysis document explaining:
|
||||
1. Why only 1 out of 5 candidates was benchmarked
|
||||
2. How the system determines which candidates to actually test
|
||||
3. Under what conditions the system would have attempted to repair the failing candidates
|
||||
4. What the user could change to get more diverse results
|
||||
|
||||
## Expected Outputs
|
||||
|
||||
A markdown file `analysis.md` with the explanation.
|
||||
|
|
@ -1,40 +0,0 @@
|
|||
{
|
||||
"total_scenarios": 5,
|
||||
"capabilities_coverage": {
|
||||
"total_capabilities": 14,
|
||||
"capabilities_tested": 10,
|
||||
"coverage_percentage": 71.4
|
||||
},
|
||||
"complexity_distribution": {
|
||||
"basic": 2,
|
||||
"intermediate": 2,
|
||||
"advanced": 1
|
||||
},
|
||||
"scenarios": [
|
||||
{
|
||||
"index": 1,
|
||||
"capability": "sequential-pipeline-debugging, token-limit-awareness, improvement-threshold",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"capability": "result-type-pattern, effort-config-pattern",
|
||||
"complexity": "intermediate"
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"capability": "test-patterns, quality-check-commands",
|
||||
"complexity": "basic"
|
||||
},
|
||||
{
|
||||
"index": 4,
|
||||
"capability": "domain-type-conventions, module-to-feature-mapping",
|
||||
"complexity": "basic"
|
||||
},
|
||||
{
|
||||
"index": 5,
|
||||
"capability": "ast-deduplication, repair-trigger-conditions",
|
||||
"complexity": "advanced"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,25 +0,0 @@
|
|||
{
|
||||
"total_infeasible": 4,
|
||||
"infeasible_capabilities": [
|
||||
{
|
||||
"capability": "ai-service-error-patterns",
|
||||
"complexity": "intermediate",
|
||||
"reasoning": "Requires actual AI service API responses and log output that cannot be meaningfully mocked without bypassing the capability being tested"
|
||||
},
|
||||
{
|
||||
"capability": "behavioral-vs-benchmark-failures",
|
||||
"complexity": "intermediate",
|
||||
"reasoning": "Requires actual test execution results with JUnit XML output and timing data that cannot be generated in a one-shot file-based eval"
|
||||
},
|
||||
{
|
||||
"capability": "language-support-patterns",
|
||||
"complexity": "advanced",
|
||||
"reasoning": "Requires the full language registry system with imports and decorators that would need the codeflash runtime to verify"
|
||||
},
|
||||
{
|
||||
"capability": "quality-check-commands",
|
||||
"complexity": "basic",
|
||||
"reasoning": "Requires running actual uv/prek/mypy commands which need the project environment and dependencies installed"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
# Module Reference
|
||||
|
||||
| Feature area | Primary module | Key files |
|
||||
|-------------|----------------|-----------|
|
||||
| New optimization strategy | `optimization/` | `function_optimizer.py`, `optimizer.py` |
|
||||
| New test type | `verification/`, `models/` | `test_runner.py`, `pytest_plugin.py`, `test_type.py` |
|
||||
| New AI service endpoint | `api/` | `aiservice.py` |
|
||||
| New language support | `languages/` | Create new `languages/<lang>/support.py` |
|
||||
| Context extraction change | `context/` | `code_context_extractor.py` |
|
||||
| New CLI command | `cli_cmds/` | `cli.py` |
|
||||
| New config option | `setup/`, `code_utils/` | `config_consts.py`, `setup/detector.py` |
|
||||
| Discovery filter | `discovery/` | `functions_to_optimize.py` |
|
||||
| PR/result changes | `github/`, `result/` | Relevant handlers |
|
||||
|
|
@ -1,146 +0,0 @@
|
|||
---
|
||||
name: add-codeflash-feature
|
||||
description: >
|
||||
Guides implementation of new functionality in the codeflash optimization engine.
|
||||
Use when adding a feature, building new functionality, implementing a new
|
||||
optimization strategy, adding a language backend, creating an API endpoint,
|
||||
extending the verification pipeline, or developing any new codeflash capability.
|
||||
Covers module identification, Result type patterns, config, types, tests, and
|
||||
quality checks.
|
||||
---
|
||||
|
||||
# Add Codeflash Feature
|
||||
|
||||
Use this workflow when implementing new functionality in the codeflash codebase — new optimization strategies, language backends, API endpoints, CLI commands, config options, or pipeline extensions.
|
||||
|
||||
## Step 1: Identify Target Modules
|
||||
|
||||
Determine which module(s) need modification. See [MODULE_REFERENCE.md](MODULE_REFERENCE.md) for the full mapping of feature areas to modules and key files.
|
||||
|
||||
**Checkpoint**: Read the target files and understand existing patterns before writing any code. Look for similar features already implemented as reference.
|
||||
|
||||
## Step 2: Follow Result Type Pattern
|
||||
|
||||
Use the `Result[L, R]` type from `either.py` for error handling in pipeline operations:
|
||||
|
||||
```python
|
||||
from codeflash.either import Success, Failure, is_successful
|
||||
|
||||
def my_operation() -> Result[str, MyResultType]:
|
||||
if error_condition:
|
||||
return Failure("descriptive error message")
|
||||
return Success(result_value)
|
||||
|
||||
# Usage:
|
||||
result = my_operation()
|
||||
if not is_successful(result):
|
||||
logger.error(result.failure())
|
||||
return
|
||||
value = result.unwrap()
|
||||
```
|
||||
|
||||
**Checkpoint**: Verify your function signatures match the `Result` pattern used in surrounding code. Not all functions use `Result` — match the convention of the module you're modifying.
|
||||
|
||||
## Step 3: Add Configuration Constants
|
||||
|
||||
If the feature needs configurable thresholds or limits:
|
||||
|
||||
1. Add constants to `code_utils/config_consts.py`
|
||||
2. If effort-dependent, add to `EFFORT_VALUES` dict with values for all three levels:
|
||||
```python
|
||||
# In config_consts.py:
|
||||
class EffortKeys(str, Enum):
|
||||
MY_NEW_KEY = "MY_NEW_KEY"
|
||||
|
||||
EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = {
|
||||
# ... existing entries ...
|
||||
EffortKeys.MY_NEW_KEY.value: {
|
||||
EffortLevel.LOW: 1,
|
||||
EffortLevel.MEDIUM: 3,
|
||||
EffortLevel.HIGH: 5,
|
||||
},
|
||||
}
|
||||
```
|
||||
3. Access via `get_effort_value(EffortKeys.MY_NEW_KEY, effort_level)`
|
||||
|
||||
**Checkpoint**: Skip this step if the feature doesn't need configuration. Not every feature requires new constants.
|
||||
|
||||
## Step 4: Add Domain Types
|
||||
|
||||
If new data structures are needed:
|
||||
|
||||
1. Add Pydantic models or frozen dataclasses to `models/models.py` or `models/function_types.py`
|
||||
2. Use `@dataclass(frozen=True)` for immutable data, `BaseModel` for models that need serialization
|
||||
3. Keep `function_types.py` dependency-free — no imports from other codeflash modules
|
||||
|
||||
Example following existing patterns:
|
||||
```python
|
||||
# In models/models.py:
|
||||
@dataclass(frozen=True)
|
||||
class MyNewType:
|
||||
name: str
|
||||
value: int
|
||||
source: OptimizedCandidateSource
|
||||
|
||||
# For serializable models:
|
||||
class MyNewModel(BaseModel):
|
||||
items: list[MyNewType] = []
|
||||
```
|
||||
|
||||
**Checkpoint**: Skip this step if you can reuse existing types. Check `models/models.py` for types that already fit your needs.
|
||||
|
||||
## Step 5: Write Tests
|
||||
|
||||
Follow existing test patterns:
|
||||
|
||||
1. Create test files in `tests/` mirroring the source structure (e.g., `tests/test_optimization/test_my_feature.py`)
|
||||
2. Use pytest's `tmp_path` fixture for temp directories — never `NamedTemporaryFile`
|
||||
3. Always call `.resolve()` on Path objects and `.as_posix()` for string conversion
|
||||
4. Assert full string equality for code context tests — no substring matching
|
||||
5. The pytest plugin patches `time`, `random`, `uuid`, `datetime` — never rely on real values in verification tests
|
||||
|
||||
```python
|
||||
def test_my_feature(tmp_path: Path) -> None:
|
||||
test_file = tmp_path / "test_module.py"
|
||||
test_file.write_text("def foo(): return 1", encoding="utf-8")
|
||||
result = my_operation(test_file.resolve())
|
||||
assert is_successful(result)
|
||||
assert result.unwrap() == expected_value
|
||||
```
|
||||
|
||||
**Checkpoint**: Run the new tests in isolation before proceeding: `uv run pytest tests/path/to/test_file.py -x`
|
||||
|
||||
## Step 6: Run Quality Checks
|
||||
|
||||
Run all validation before committing:
|
||||
|
||||
```bash
|
||||
# Pre-commit checks (ruff format + lint)
|
||||
uv run prek run
|
||||
|
||||
# Type checking
|
||||
uv run mypy codeflash/
|
||||
|
||||
# Run relevant tests
|
||||
uv run pytest tests/path/to/relevant/tests -x
|
||||
```
|
||||
|
||||
**If checks fail**:
|
||||
- `prek run` failures: Fix formatting/lint issues reported by ruff, then re-run
|
||||
- `mypy` failures: Fix type errors — common issues are missing return types, wrong `Optional` usage, or missing imports in `TYPE_CHECKING` block
|
||||
- Test failures: Fix the failing test or the implementation, then re-run
|
||||
|
||||
## Step 7: Language Support Considerations
|
||||
|
||||
If the feature needs to work across languages:
|
||||
|
||||
1. Use `get_language_support(identifier)` from `languages/registry.py` — never import language classes directly
|
||||
2. Current language is a singleton: `set_current_language()` / `current_language()` from `languages/current.py`
|
||||
3. Use `is_python()` / `is_javascript()` guards for language-specific branches
|
||||
4. New language support classes must use `@register_language` decorator and be instantiable without arguments
|
||||
|
||||
**Checkpoint**: Skip this step if the feature is Python-only. Most features don't need multi-language support.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If you run into issues, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common problems and fixes (circular imports, `UnsupportedLanguageError`, CI path failures, Pydantic validation errors, token limit exceeded).
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
# Troubleshooting
|
||||
|
||||
| Problem | Likely cause | Fix |
|
||||
|---------|-------------|-----|
|
||||
| Circular import at startup | Importing from `models/` in a module loaded early | Move import into `TYPE_CHECKING` block or use lazy import |
|
||||
| `UnsupportedLanguageError` | Language modules not registered yet | Call `_ensure_languages_registered()` or use `get_language_support()` which does it automatically |
|
||||
| Tests pass locally but fail in CI | Path differences (absolute vs relative) | Always use `.resolve()` on Path objects |
|
||||
| `ValidationError` from Pydantic | Invalid code passed to `CodeString` | Check that generated code passes syntax validation for the target language |
|
||||
| `encoded_tokens_len` exceeds limit | Context too large | Reduce helper functions or split into read-only vs read-writable |
|
||||
|
|
@ -1,124 +0,0 @@
|
|||
---
|
||||
name: debug-optimization-failure
|
||||
description: >
|
||||
Diagnose why a codeflash optimization produced no results or failed silently.
|
||||
Use when an optimization run errors out, returns no candidates, or all candidates
|
||||
are rejected. Walks through discovery, ranking, context limits, AI service,
|
||||
test verification, deduplication, and repair stages.
|
||||
---
|
||||
|
||||
# Debug Optimization Failure
|
||||
|
||||
Use this workflow when an optimization run fails or produces no results. Work through the stages sequentially — stop at the first failure found.
|
||||
|
||||
## Step 1: Check Function Discovery
|
||||
|
||||
Determine if the function was discovered by `FunctionVisitor`.
|
||||
|
||||
1. Search logs for the function name in discovery output:
|
||||
```python
|
||||
# In discovery/functions_to_optimize.py, FunctionVisitor filters out:
|
||||
# - Functions matching exclude patterns in pyproject.toml [tool.codeflash]
|
||||
# - Functions already optimized (was_function_previously_optimized())
|
||||
# - Functions outside the configured module-root
|
||||
```
|
||||
2. Verify the function file is under the configured `module-root` in `pyproject.toml`
|
||||
3. Check if the function was previously optimized — look for it in the optimization history
|
||||
|
||||
**Checkpoint**: If the function doesn't appear in discovery output, fix config patterns or file location before proceeding.
|
||||
|
||||
## Step 2: Check Ranking
|
||||
|
||||
If trace data is used, check if the function was ranked high enough.
|
||||
|
||||
1. Look at `benchmarking/function_ranker.py` output for the function's addressable time
|
||||
2. The function must exceed `DEFAULT_IMPORTANCE_THRESHOLD=0.001`:
|
||||
```python
|
||||
    # Addressable time = own time + (callee time / call count)
|
||||
# Grep for the function in ranking output:
|
||||
# grep -i "function_name" in ranking logs
|
||||
```
|
||||
3. Functions below the threshold are silently skipped
|
||||
|
||||
**Checkpoint**: If ranked too low, the function doesn't spend enough time to be worth optimizing. No fix needed — this is expected.
|
||||
|
||||
## Step 3: Check Context Token Limits
|
||||
|
||||
Verify the function's context fits within token limits.
|
||||
|
||||
1. Check thresholds in `code_utils/config_consts.py`:
|
||||
```python
|
||||
OPTIMIZATION_CONTEXT_TOKEN_LIMIT = 16000 # tokens
|
||||
TESTGEN_CONTEXT_TOKEN_LIMIT = 16000 # tokens
|
||||
```
|
||||
2. Token counting uses `encoded_tokens_len()` from `code_utils/code_utils.py`
|
||||
3. Common causes: large helper function chains, deep dependency trees, large class hierarchies
|
||||
|
||||
**Checkpoint**: If context exceeds limits, the function is rejected. Consider refactoring to reduce dependencies or splitting large modules.
|
||||
|
||||
## Step 4: Check AI Service Response
|
||||
|
||||
Verify the AI service returned valid candidates.
|
||||
|
||||
1. Look for HTTP errors in logs:
|
||||
```
|
||||
# Error patterns to search for:
|
||||
"Error generating optimized candidates"
|
||||
"Error generating jit rewritten candidate"
|
||||
"cli-optimize-error-caught"
|
||||
"cli-optimize-error-response"
|
||||
```
|
||||
2. Check `_get_valid_candidates()` in `api/aiservice.py` — empty `code_strings` after `CodeStringsMarkdown.parse_markdown_code()` means the LLM returned malformed code blocks
|
||||
3. Verify API key is valid (`get_codeflash_api_key()`)
|
||||
|
||||
**Checkpoint**: If no candidates returned, check API key, network, and service status before proceeding.
|
||||
|
||||
## Step 5: Check Test Failures
|
||||
|
||||
Determine if candidates failed behavioral or benchmark tests.
|
||||
|
||||
1. **Behavioral failures** — compare return values, stdout, pass/fail between baseline and candidate:
|
||||
```python
|
||||
# TestDiffScope enum values to look for:
|
||||
# RETURN_VALUE - function returned different value
|
||||
# STDOUT - different stdout output
|
||||
# DID_PASS - test passed/failed differently
|
||||
```
|
||||
2. **Benchmark failures** — a candidate must exceed `MIN_IMPROVEMENT_THRESHOLD=0.05` (i.e., show at least a 5% speedup over the baseline)
|
||||
3. **Stability failures** — timing must be stable within `STABILITY_WINDOW_SIZE=0.35` (35% of iterations)
|
||||
4. Check JUnit XML test results in the temp directory for specific failure messages
|
||||
|
||||
**Checkpoint**: Behavioral failure = optimization changed behavior (check test diffs). Benchmark failure = not fast enough. Stability failure = noisy timing environment.
|
||||
|
||||
## Step 6: Check Deduplication
|
||||
|
||||
Verify candidates weren't deduplicated away.
|
||||
|
||||
1. `CandidateEvaluationContext.ast_code_to_id` tracks normalized AST → candidate mapping
|
||||
2. `normalize_code()` from `code_utils/deduplicate_code.py` strips comments/whitespace and normalizes the AST
|
||||
3. If all candidates normalize to identical code, only the first is tested — the rest copy its results
|
||||
|
||||
**Checkpoint**: If all duplicates, the LLM generated the same optimization repeatedly. Try a higher effort level for more diverse candidates.
|
||||
|
||||
## Step 7: Check Repair/Refinement
|
||||
|
||||
If initial candidates failed, check repair and refinement stages.
|
||||
|
||||
1. Repair only triggers if fewer than `MIN_CORRECT_CANDIDATES=2` passed behavioral tests
|
||||
2. Repair sends `AIServiceCodeRepairRequest` with `TestDiff` objects showing what went wrong
|
||||
3. Check `REPAIR_UNMATCHED_PERCENTAGE_LIMIT` (effort-dependent: 0.2/0.3/0.4) — if too many tests failed, repair is skipped entirely
|
||||
4. Refinement only runs on the top valid candidates (count depends on effort level)
|
||||
|
||||
**Checkpoint**: If repair also fails, the optimization approach likely doesn't work for this function. The function may rely on side effects or external state that the LLM can't safely optimize.
|
||||
|
||||
## Key Files Reference
|
||||
|
||||
| File | What to check |
|
||||
|------|---------------|
|
||||
| `optimization/function_optimizer.py` | Main loop, `determine_best_candidate()` |
|
||||
| `verification/test_runner.py` | Test subprocess execution |
|
||||
| `api/aiservice.py` | AI service requests/responses |
|
||||
| `code_utils/config_consts.py` | All thresholds and limits |
|
||||
| `context/code_context_extractor.py` | Context extraction and token counting |
|
||||
| `models/models.py` | `CandidateEvaluationContext`, `TestResults`, `TestDiff` |
|
||||
| `code_utils/deduplicate_code.py` | AST normalization for deduplication |
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
{
|
||||
"name": "codeflash/codeflash-skills",
|
||||
"version": "0.2.0",
|
||||
"summary": "Procedural workflows for developing and debugging codeflash",
|
||||
"private": true,
|
||||
"skills": {
|
||||
"debug-optimization-failure": {
|
||||
"path": "skills/debug-optimization-failure/SKILL.md"
|
||||
},
|
||||
"add-codeflash-feature": {
|
||||
"path": "skills/add-codeflash-feature/SKILL.md"
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in a new issue