perf: eager adjacency build, skip redundant file I/O, cache resolved paths

- Build CallGraph adjacency eagerly in __post_init__ instead of lazily,
  eliminating per-call None checks in callers_of/callees_of hot paths
- Skip file read+hash in ensure_file_indexed/build_index when the file
  is already in the in-memory indexed_file_hashes cache
- Cache Path.resolve() results in ReferenceGraph to avoid repeated
  filesystem syscalls for the same paths
- Reuse callee_counts from rank_by_dependency_count in the optimizer
  loop instead of recomputing
This commit is contained in:
Kevin Turcios 2026-03-14 21:04:26 -06:00
parent 01847f9acc
commit ff7b93dbb2
3 changed files with 38 additions and 23 deletions

View file

@ -200,6 +200,7 @@ class ReferenceGraph:
self.conn = sqlite3.connect(str(db_path))
self.conn.execute("PRAGMA journal_mode=WAL")
self.indexed_file_hashes: dict[str, str] = {}
self._resolved_paths: dict[Path, str] = {}
self._init_schema()
def _init_schema(self) -> None:
@ -260,6 +261,14 @@ class ReferenceGraph:
)
self.conn.commit()
def resolve_path(self, file_path: Path) -> str:
    """Return ``str(file_path.resolve())``, memoized per ``Path``.

    ``Path.resolve()`` hits the filesystem, so repeated resolution of the
    same path is cached in ``self._resolved_paths`` and served from memory
    on every subsequent call.
    """
    try:
        # EAFP fast path: one dict lookup when the path was seen before.
        return self._resolved_paths[file_path]
    except KeyError:
        result = str(file_path.resolve())
        self._resolved_paths[file_path] = result
        return result
def get_callees(
self, file_path_to_qualified_names: dict[Path, set[str]]
) -> tuple[dict[Path, set[FunctionSource]], list[FunctionSource]]:
@ -273,7 +282,7 @@ class ReferenceGraph:
) -> dict[tuple[Path, str], int]:
all_caller_keys: list[tuple[Path, str, str]] = []
for file_path, qualified_names in file_path_to_qualified_names.items():
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
self.ensure_file_indexed(file_path, resolved)
all_caller_keys.extend((file_path, resolved, qn) for qn in qualified_names)
@ -308,9 +317,13 @@ class ReferenceGraph:
def ensure_file_indexed(self, file_path: Path, resolved: str | None = None) -> IndexResult:
if resolved is None:
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
# Always read and hash the file before checking the cache so we detect on-disk changes
# Fast path: if already indexed this session, skip disk I/O
if resolved in self.indexed_file_hashes:
return IndexResult(file_path=file_path, cached=True, num_edges=0, edges=(), cross_file_edges=0, error=False)
# Read and hash the file to detect on-disk changes vs DB cache
try:
content = file_path.read_text(encoding="utf-8")
except Exception:
@ -325,7 +338,7 @@ class ReferenceGraph:
def index_file(self, file_path: Path, file_hash: str, resolved: str | None = None) -> IndexResult:
if resolved is None:
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
edges, had_error = _analyze_file(file_path, self.jedi_project, self.project_root_str)
if had_error:
logger.debug(f"ReferenceGraph: failed to parse {file_path}")
@ -388,7 +401,17 @@ class ReferenceGraph:
to_index: list[tuple[Path, str, str]] = []
for file_path in file_paths:
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
# Fast path: already indexed this session
if resolved in self.indexed_file_hashes:
self._report_progress(
on_progress,
IndexResult(
file_path=file_path, cached=True, num_edges=0, edges=(), cross_file_edges=0, error=False
),
)
continue
try:
content = file_path.read_text(encoding="utf-8")
@ -403,7 +426,7 @@ class ReferenceGraph:
file_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
# Check if already cached (in-memory or DB)
# Check if cached in DB
if self._is_file_cached(resolved, file_hash):
self._report_progress(
on_progress,
@ -509,7 +532,7 @@ class ReferenceGraph:
all_caller_keys: list[tuple[Path, str, str]] = []
for file_path, qualified_names in file_path_to_qualified_names.items():
resolved = str(file_path.resolve())
resolved = self.resolve_path(file_path)
self.ensure_file_indexed(file_path, resolved)
all_caller_keys.extend((file_path, resolved, qn) for qn in qualified_names)

View file

@ -39,11 +39,11 @@ class CallEdge:
@dataclass
class CallGraph:
edges: list[CallEdge]
_forward: dict[FunctionNode, list[CallEdge]] | None = field(default=None, init=False, repr=False)
_reverse: dict[FunctionNode, list[CallEdge]] | None = field(default=None, init=False, repr=False)
_nodes: set[FunctionNode] | None = field(default=None, init=False, repr=False)
_forward: dict[FunctionNode, list[CallEdge]] = field(default_factory=dict, init=False, repr=False)
_reverse: dict[FunctionNode, list[CallEdge]] = field(default_factory=dict, init=False, repr=False)
_nodes: set[FunctionNode] = field(default_factory=set, init=False, repr=False)
def _build_adjacency(self) -> None:
def __post_init__(self) -> None:
fwd: dict[FunctionNode, list[CallEdge]] = {}
rev: dict[FunctionNode, list[CallEdge]] = {}
nodes: set[FunctionNode] = set()
@ -58,23 +58,14 @@ class CallGraph:
@property
def forward(self) -> dict[FunctionNode, list[CallEdge]]:
if self._forward is None:
self._build_adjacency()
assert self._forward is not None
return self._forward
@property
def reverse(self) -> dict[FunctionNode, list[CallEdge]]:
if self._reverse is None:
self._build_adjacency()
assert self._reverse is not None
return self._reverse
@property
def nodes(self) -> set[FunctionNode]:
if self._nodes is None:
self._build_adjacency()
assert self._nodes is not None
return self._nodes
def callees_of(self, node: FunctionNode) -> list[CallEdge]:

View file

@ -422,6 +422,7 @@ class Optimizer:
for file_path, func in all_functions:
file_to_qns[file_path].add(func.qualified_name)
callee_counts = call_graph.count_callees_per_function(dict(file_to_qns))
self._cached_callee_counts = callee_counts
if function_to_tests:
from codeflash.discovery.discover_unit_tests import existing_unit_test_count
@ -526,9 +527,9 @@ class Optimizer:
# Cache for module preparation (avoid re-parsing same files)
prepared_modules: dict[Path, tuple[dict[Path, ValidCode], ast.Module | None]] = {}
# Build callee counts for per-function logging
callee_counts: dict[tuple[Path, str], int] = {}
if resolver is not None:
# Reuse callee counts from rank_by_dependency_count if available, otherwise compute
callee_counts: dict[tuple[Path, str], int] = getattr(self, "_cached_callee_counts", {})
if not callee_counts and resolver is not None:
file_to_qns: dict[Path, set[str]] = defaultdict(set)
for fp, fn in globally_ranked_functions:
file_to_qns[fp].add(fn.qualified_name)