perf: eager adjacency build, skip redundant file I/O, cache resolved paths

- Build CallGraph adjacency eagerly in __post_init__ instead of lazily,
  eliminating per-call None checks in callers_of/callees_of hot paths
- Skip file read+hash in ensure_file_indexed/build_index when the file
  is already in the in-memory indexed_file_hashes cache
- Cache Path.resolve() results in ReferenceGraph to avoid repeated
  filesystem syscalls for the same paths
- Reuse callee_counts from rank_by_dependency_count in the optimizer
  loop instead of recomputing
This commit is contained in:
Kevin Turcios 2026-03-14 21:04:26 -06:00
parent 01847f9acc
commit ff7b93dbb2
3 changed files with 38 additions and 23 deletions

View file

@ -200,6 +200,7 @@ class ReferenceGraph:
self.conn = sqlite3.connect(str(db_path))
self.conn.execute("PRAGMA journal_mode=WAL")
self.indexed_file_hashes: dict[str, str] = {}
self._resolved_paths: dict[Path, str] = {}
self._init_schema()
def _init_schema(self) -> None:
@ -260,6 +261,14 @@ class ReferenceGraph:
)
self.conn.commit()
def resolve_path(self, file_path: Path) -> str:
    """Return ``str(file_path.resolve())``, memoized per ``Path``.

    ``Path.resolve()`` hits the filesystem, so repeated resolution of the
    same path is cached in ``self._resolved_paths`` and served from memory
    on every subsequent call.
    """
    try:
        # EAFP fast path: one dict lookup when the path was seen before.
        return self._resolved_paths[file_path]
    except KeyError:
        result = str(file_path.resolve())
        self._resolved_paths[file_path] = result
        return result
def get_callees(
self, file_path_to_qualified_names: dict[Path, set[str]]
) -> tuple[dict[Path, set[FunctionSource]], list[FunctionSource]]:
@ -273,7 +282,7 @@ class ReferenceGraph:
) -> dict[tuple[Path, str], int]:
all_caller_keys: list[tuple[Path, str, str]] = []
for file_path, qualified_names in file_path_to_qualified_names.items():
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
self.ensure_file_indexed(file_path, resolved)
all_caller_keys.extend((file_path, resolved, qn) for qn in qualified_names)
@ -308,9 +317,13 @@ class ReferenceGraph:
def ensure_file_indexed(self, file_path: Path, resolved: str | None = None) -> IndexResult:
if resolved is None:
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
# Always read and hash the file before checking the cache so we detect on-disk changes
# Fast path: if already indexed this session, skip disk I/O
if resolved in self.indexed_file_hashes:
return IndexResult(file_path=file_path, cached=True, num_edges=0, edges=(), cross_file_edges=0, error=False)
# Read and hash the file to detect on-disk changes vs DB cache
try:
content = file_path.read_text(encoding="utf-8")
except Exception:
@ -325,7 +338,7 @@ class ReferenceGraph:
def index_file(self, file_path: Path, file_hash: str, resolved: str | None = None) -> IndexResult:
if resolved is None:
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
edges, had_error = _analyze_file(file_path, self.jedi_project, self.project_root_str)
if had_error:
logger.debug(f"ReferenceGraph: failed to parse {file_path}")
@ -388,7 +401,17 @@ class ReferenceGraph:
to_index: list[tuple[Path, str, str]] = []
for file_path in file_paths:
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
# Fast path: already indexed this session
if resolved in self.indexed_file_hashes:
self._report_progress(
on_progress,
IndexResult(
file_path=file_path, cached=True, num_edges=0, edges=(), cross_file_edges=0, error=False
),
)
continue
try:
content = file_path.read_text(encoding="utf-8")
@ -403,7 +426,7 @@ class ReferenceGraph:
file_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
# Check if already cached (in-memory or DB)
# Check if cached in DB
if self._is_file_cached(resolved, file_hash):
self._report_progress(
on_progress,
@ -509,7 +532,7 @@ class ReferenceGraph:
all_caller_keys: list[tuple[Path, str, str]] = []
for file_path, qualified_names in file_path_to_qualified_names.items():
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
self.ensure_file_indexed(file_path, resolved)
all_caller_keys.extend((file_path, resolved, qn) for qn in qualified_names)

View file

@ -39,11 +39,11 @@ class CallEdge:
@dataclass
class CallGraph:
edges: list[CallEdge]
_forward: dict[FunctionNode, list[CallEdge]] | None = field(default=None, init=False, repr=False)
_reverse: dict[FunctionNode, list[CallEdge]] | None = field(default=None, init=False, repr=False)
_nodes: set[FunctionNode] | None = field(default=None, init=False, repr=False)
_forward: dict[FunctionNode, list[CallEdge]] = field(default_factory=dict, init=False, repr=False)
_reverse: dict[FunctionNode, list[CallEdge]] = field(default_factory=dict, init=False, repr=False)
_nodes: set[FunctionNode] = field(default_factory=set, init=False, repr=False)
def _build_adjacency(self) -> None:
def __post_init__(self) -> None:
fwd: dict[FunctionNode, list[CallEdge]] = {}
rev: dict[FunctionNode, list[CallEdge]] = {}
nodes: set[FunctionNode] = set()
@ -58,23 +58,14 @@ class CallGraph:
@property
def forward(self) -> dict[FunctionNode, list[CallEdge]]:
if self._forward is None:
self._build_adjacency()
assert self._forward is not None
return self._forward
@property
def reverse(self) -> dict[FunctionNode, list[CallEdge]]:
if self._reverse is None:
self._build_adjacency()
assert self._reverse is not None
return self._reverse
@property
def nodes(self) -> set[FunctionNode]:
if self._nodes is None:
self._build_adjacency()
assert self._nodes is not None
return self._nodes
def callees_of(self, node: FunctionNode) -> list[CallEdge]:

View file

@ -422,6 +422,7 @@ class Optimizer:
for file_path, func in all_functions:
file_to_qns[file_path].add(func.qualified_name)
callee_counts = call_graph.count_callees_per_function(dict(file_to_qns))
self._cached_callee_counts = callee_counts
if function_to_tests:
from codeflash.discovery.discover_unit_tests import existing_unit_test_count
@ -526,9 +527,9 @@ class Optimizer:
# Cache for module preparation (avoid re-parsing same files)
prepared_modules: dict[Path, tuple[dict[Path, ValidCode], ast.Module | None]] = {}
# Build callee counts for per-function logging
callee_counts: dict[tuple[Path, str], int] = {}
if resolver is not None:
# Reuse callee counts from rank_by_dependency_count if available, otherwise compute
callee_counts: dict[tuple[Path, str], int] = getattr(self, "_cached_callee_counts", {})
if not callee_counts and resolver is not None:
file_to_qns: dict[Path, set[str]] = defaultdict(set)
for fp, fn in globally_ranked_functions:
file_to_qns[fp].add(fn.qualified_name)