perf: eager adjacency build, skip redundant file I/O, cache resolved paths
- Build CallGraph adjacency eagerly in __post_init__ instead of lazily, eliminating per-call None checks in callers_of/callees_of hot paths
- Skip file read+hash in ensure_file_indexed/build_index when the file is already in the in-memory indexed_file_hashes cache
- Cache Path.resolve() results in ReferenceGraph to avoid repeated filesystem syscalls for the same paths
- Reuse callee_counts from rank_by_dependency_count in the optimizer loop instead of recomputing
This commit is contained in:
parent
01847f9acc
commit
ff7b93dbb2
3 changed files with 38 additions and 23 deletions
|
|
@ -200,6 +200,7 @@ class ReferenceGraph:
|
|||
self.conn = sqlite3.connect(str(db_path))
|
||||
self.conn.execute("PRAGMA journal_mode=WAL")
|
||||
self.indexed_file_hashes: dict[str, str] = {}
|
||||
self._resolved_paths: dict[Path, str] = {}
|
||||
self._init_schema()
|
||||
|
||||
def _init_schema(self) -> None:
|
||||
|
|
@ -260,6 +261,14 @@ class ReferenceGraph:
|
|||
)
|
||||
self.conn.commit()
|
||||
|
||||
def resolve_path(self, file_path: Path) -> str:
|
||||
cached = self._resolved_paths.get(file_path)
|
||||
if cached is not None:
|
||||
return cached
|
||||
resolved = str(file_path.resolve())
|
||||
self._resolved_paths[file_path] = resolved
|
||||
return resolved
|
||||
|
||||
def get_callees(
|
||||
self, file_path_to_qualified_names: dict[Path, set[str]]
|
||||
) -> tuple[dict[Path, set[FunctionSource]], list[FunctionSource]]:
|
||||
|
|
@ -273,7 +282,7 @@ class ReferenceGraph:
|
|||
) -> dict[tuple[Path, str], int]:
|
||||
all_caller_keys: list[tuple[Path, str, str]] = []
|
||||
for file_path, qualified_names in file_path_to_qualified_names.items():
|
||||
resolved = str(file_path.resolve())
|
||||
resolved = self.resolve_path(file_path)
|
||||
self.ensure_file_indexed(file_path, resolved)
|
||||
all_caller_keys.extend((file_path, resolved, qn) for qn in qualified_names)
|
||||
|
||||
|
|
@ -308,9 +317,13 @@ class ReferenceGraph:
|
|||
|
||||
def ensure_file_indexed(self, file_path: Path, resolved: str | None = None) -> IndexResult:
|
||||
if resolved is None:
|
||||
resolved = str(file_path.resolve())
|
||||
resolved = self.resolve_path(file_path)
|
||||
|
||||
# Always read and hash the file before checking the cache so we detect on-disk changes
|
||||
# Fast path: if already indexed this session, skip disk I/O
|
||||
if resolved in self.indexed_file_hashes:
|
||||
return IndexResult(file_path=file_path, cached=True, num_edges=0, edges=(), cross_file_edges=0, error=False)
|
||||
|
||||
# Read and hash the file to detect on-disk changes vs DB cache
|
||||
try:
|
||||
content = file_path.read_text(encoding="utf-8")
|
||||
except Exception:
|
||||
|
|
@ -325,7 +338,7 @@ class ReferenceGraph:
|
|||
|
||||
def index_file(self, file_path: Path, file_hash: str, resolved: str | None = None) -> IndexResult:
|
||||
if resolved is None:
|
||||
resolved = str(file_path.resolve())
|
||||
resolved = self.resolve_path(file_path)
|
||||
edges, had_error = _analyze_file(file_path, self.jedi_project, self.project_root_str)
|
||||
if had_error:
|
||||
logger.debug(f"ReferenceGraph: failed to parse {file_path}")
|
||||
|
|
@ -388,7 +401,17 @@ class ReferenceGraph:
|
|||
to_index: list[tuple[Path, str, str]] = []
|
||||
|
||||
for file_path in file_paths:
|
||||
resolved = str(file_path.resolve())
|
||||
resolved = self.resolve_path(file_path)
|
||||
|
||||
# Fast path: already indexed this session
|
||||
if resolved in self.indexed_file_hashes:
|
||||
self._report_progress(
|
||||
on_progress,
|
||||
IndexResult(
|
||||
file_path=file_path, cached=True, num_edges=0, edges=(), cross_file_edges=0, error=False
|
||||
),
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
content = file_path.read_text(encoding="utf-8")
|
||||
|
|
@ -403,7 +426,7 @@ class ReferenceGraph:
|
|||
|
||||
file_hash = hashlib.sha256(content.encode("utf-8")).hexdigest()
|
||||
|
||||
# Check if already cached (in-memory or DB)
|
||||
# Check if cached in DB
|
||||
if self._is_file_cached(resolved, file_hash):
|
||||
self._report_progress(
|
||||
on_progress,
|
||||
|
|
@ -509,7 +532,7 @@ class ReferenceGraph:
|
|||
|
||||
all_caller_keys: list[tuple[Path, str, str]] = []
|
||||
for file_path, qualified_names in file_path_to_qualified_names.items():
|
||||
resolved = str(file_path.resolve())
|
||||
resolved = self.resolve_path(file_path)
|
||||
self.ensure_file_indexed(file_path, resolved)
|
||||
all_caller_keys.extend((file_path, resolved, qn) for qn in qualified_names)
|
||||
|
||||
|
|
|
|||
|
|
@ -39,11 +39,11 @@ class CallEdge:
|
|||
@dataclass
|
||||
class CallGraph:
|
||||
edges: list[CallEdge]
|
||||
_forward: dict[FunctionNode, list[CallEdge]] | None = field(default=None, init=False, repr=False)
|
||||
_reverse: dict[FunctionNode, list[CallEdge]] | None = field(default=None, init=False, repr=False)
|
||||
_nodes: set[FunctionNode] | None = field(default=None, init=False, repr=False)
|
||||
_forward: dict[FunctionNode, list[CallEdge]] = field(default_factory=dict, init=False, repr=False)
|
||||
_reverse: dict[FunctionNode, list[CallEdge]] = field(default_factory=dict, init=False, repr=False)
|
||||
_nodes: set[FunctionNode] = field(default_factory=set, init=False, repr=False)
|
||||
|
||||
def _build_adjacency(self) -> None:
|
||||
def __post_init__(self) -> None:
|
||||
fwd: dict[FunctionNode, list[CallEdge]] = {}
|
||||
rev: dict[FunctionNode, list[CallEdge]] = {}
|
||||
nodes: set[FunctionNode] = set()
|
||||
|
|
@ -58,23 +58,14 @@ class CallGraph:
|
|||
|
||||
@property
|
||||
def forward(self) -> dict[FunctionNode, list[CallEdge]]:
|
||||
if self._forward is None:
|
||||
self._build_adjacency()
|
||||
assert self._forward is not None
|
||||
return self._forward
|
||||
|
||||
@property
|
||||
def reverse(self) -> dict[FunctionNode, list[CallEdge]]:
|
||||
if self._reverse is None:
|
||||
self._build_adjacency()
|
||||
assert self._reverse is not None
|
||||
return self._reverse
|
||||
|
||||
@property
|
||||
def nodes(self) -> set[FunctionNode]:
|
||||
if self._nodes is None:
|
||||
self._build_adjacency()
|
||||
assert self._nodes is not None
|
||||
return self._nodes
|
||||
|
||||
def callees_of(self, node: FunctionNode) -> list[CallEdge]:
|
||||
|
|
|
|||
|
|
@ -422,6 +422,7 @@ class Optimizer:
|
|||
for file_path, func in all_functions:
|
||||
file_to_qns[file_path].add(func.qualified_name)
|
||||
callee_counts = call_graph.count_callees_per_function(dict(file_to_qns))
|
||||
self._cached_callee_counts = callee_counts
|
||||
|
||||
if function_to_tests:
|
||||
from codeflash.discovery.discover_unit_tests import existing_unit_test_count
|
||||
|
|
@ -526,9 +527,9 @@ class Optimizer:
|
|||
# Cache for module preparation (avoid re-parsing same files)
|
||||
prepared_modules: dict[Path, tuple[dict[Path, ValidCode], ast.Module | None]] = {}
|
||||
|
||||
# Build callee counts for per-function logging
|
||||
callee_counts: dict[tuple[Path, str], int] = {}
|
||||
if resolver is not None:
|
||||
# Reuse callee counts from rank_by_dependency_count if available, otherwise compute
|
||||
callee_counts: dict[tuple[Path, str], int] = getattr(self, "_cached_callee_counts", {})
|
||||
if not callee_counts and resolver is not None:
|
||||
file_to_qns: dict[Path, set[str]] = defaultdict(set)
|
||||
for fp, fn in globally_ranked_functions:
|
||||
file_to_qns[fp].add(fn.qualified_name)
|
||||
|
|
|
|||
Loading…
Reference in a new issue