mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
Improve deep optimizer: profiling script + failure modes + dist fix (#24)
* Exclude dev docs from plugin dist builds: README.md, ARCHITECTURE.md, and ROADMAP.md are development docs that shouldn't ship in the assembled plugin distributions.
* Improve deep optimizer (fix profiling script, add failure mode awareness). Profiling script: accept source root and command as CLI args instead of hardcoding `src` and requiring manual `# === RUN TARGET HERE ===` edits. The agent now copies the script from references and runs it with the project's actual source root and test command. Failure modes: wire failure-modes.md into the on-demand reference table and stuck recovery checklist so the agent consults it when workflows break (deadlocks, silent failures, context loss, stale results).
* Fix ruff lint errors in unified profiling script: refactor main() into parse_args(), profile_command(), and report_results() to fix C901 (complexity) and PLR0915 (too many statements). Also fix S306 (mktemp → NamedTemporaryFile), PLW1510 (explicit check=False), and add noqa for intentional os.path usage (PTH112) and subprocess with CLI args (S603).
This commit is contained in:
parent
20f6c59f05
commit
7e00007569
3 changed files with 149 additions and 95 deletions
2
Makefile
2
Makefile
|
|
@ -12,7 +12,7 @@ LANGS := $(notdir $(wildcard plugin/languages/*))
|
||||||
build: clean
|
build: clean
|
||||||
@for lang in $(LANGS); do \
|
@for lang in $(LANGS); do \
|
||||||
echo "Assembling plugin ($$lang) → dist-$$lang/"; \
|
echo "Assembling plugin ($$lang) → dist-$$lang/"; \
|
||||||
rsync -a --exclude='languages/' plugin/ dist-$$lang/; \
|
rsync -a --exclude='languages/' --exclude='README.md' --exclude='ARCHITECTURE.md' --exclude='ROADMAP.md' plugin/ dist-$$lang/; \
|
||||||
rsync -a plugin/languages/$$lang/agents/ dist-$$lang/agents/; \
|
rsync -a plugin/languages/$$lang/agents/ dist-$$lang/agents/; \
|
||||||
rsync -a plugin/languages/$$lang/references/ dist-$$lang/references/; \
|
rsync -a plugin/languages/$$lang/references/ dist-$$lang/references/; \
|
||||||
rsync -a plugin/languages/$$lang/skills/ dist-$$lang/skills/; \
|
rsync -a plugin/languages/$$lang/skills/ dist-$$lang/skills/; \
|
||||||
|
|
|
||||||
|
|
@ -166,60 +166,33 @@ You MUST profile before making any code changes. The unified profiling script be
|
||||||
|
|
||||||
### Unified CPU + Memory profiling (MANDATORY first step)
|
### Unified CPU + Memory profiling (MANDATORY first step)
|
||||||
|
|
||||||
This gives you the cross-domain view that single-domain agents lack.
|
This gives you the cross-domain view that single-domain agents lack. The script lives at `${CLAUDE_PLUGIN_ROOT}/languages/python/references/unified-profiling-script.py` — copy it to `/tmp/deep_profile.py` and run it.
|
||||||
|
|
||||||
```python
|
```bash
|
||||||
# /tmp/deep_profile.py
|
cp "${CLAUDE_PLUGIN_ROOT}/languages/python/references/unified-profiling-script.py" /tmp/deep_profile.py
|
||||||
import cProfile, tracemalloc, gc, time, pstats, os, sys
|
|
||||||
|
|
||||||
# Track GC to quantify allocation→CPU interaction
|
|
||||||
gc_times = []
|
|
||||||
def gc_callback(phase, info):
|
|
||||||
if phase == 'start':
|
|
||||||
gc_callback._start = time.perf_counter()
|
|
||||||
elif phase == 'stop':
|
|
||||||
gc_times.append(time.perf_counter() - gc_callback._start)
|
|
||||||
gc.callbacks.append(gc_callback)
|
|
||||||
|
|
||||||
tracemalloc.start()
|
|
||||||
profiler = cProfile.Profile()
|
|
||||||
|
|
||||||
profiler.enable()
|
|
||||||
# === RUN TARGET HERE ===
|
|
||||||
profiler.disable()
|
|
||||||
|
|
||||||
mem_snapshot = tracemalloc.take_snapshot()
|
|
||||||
profiler.dump_stats('/tmp/deep_cpu.prof')
|
|
||||||
|
|
||||||
# Memory top allocators
|
|
||||||
print("=== MEMORY: Top allocators ===")
|
|
||||||
for stat in mem_snapshot.statistics('lineno')[:15]:
|
|
||||||
print(stat)
|
|
||||||
|
|
||||||
# GC impact
|
|
||||||
total_gc = sum(gc_times)
|
|
||||||
print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===")
|
|
||||||
|
|
||||||
# CPU top functions (project-only)
|
|
||||||
print("\n=== CPU: Top project functions ===")
|
|
||||||
p = pstats.Stats('/tmp/deep_cpu.prof')
|
|
||||||
stats = p.stats
|
|
||||||
src = os.path.abspath('src') # adjust to project source root
|
|
||||||
project_funcs = []
|
|
||||||
for (file, line, name), (cc, nc, tt, ct, callers) in stats.items():
|
|
||||||
if not os.path.abspath(file).startswith(src):
|
|
||||||
continue
|
|
||||||
project_funcs.append((ct, tt, name, file, line))
|
|
||||||
project_funcs.sort(reverse=True)
|
|
||||||
total = project_funcs[0][0] if project_funcs else 1
|
|
||||||
if not os.path.exists('/tmp/deep_baseline_total'):
|
|
||||||
with open('/tmp/deep_baseline_total', 'w') as f:
|
|
||||||
f.write(str(total))
|
|
||||||
for ct, tt, name, file, line in project_funcs[:15]:
|
|
||||||
pct = ct / total * 100
|
|
||||||
print(f" {name:30s} — {pct:5.1f}% cumtime, {tt:.3f}s self")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
**Usage:** `$RUNNER /tmp/deep_profile.py <source_root> -- <command> [args...]`
|
||||||
|
|
||||||
|
- `<source_root>` — directory containing project source. Only functions under this path appear in the CPU report. Read this from `.codeflash/setup.md` (the package directory name — e.g., `src`, `mypackage`, or `.` to include everything).
|
||||||
|
- Everything after `--` is the command to profile.
|
||||||
|
|
||||||
|
**Examples:**
|
||||||
|
```bash
|
||||||
|
# Profile a specific test
|
||||||
|
$RUNNER /tmp/deep_profile.py src -- pytest tests/test_pipeline.py -x
|
||||||
|
|
||||||
|
# Profile a benchmark script
|
||||||
|
$RUNNER /tmp/deep_profile.py mypackage -- python scripts/benchmark.py
|
||||||
|
|
||||||
|
# Profile an import + function call
|
||||||
|
$RUNNER /tmp/deep_profile.py . -- python -c "from mypackage import run; run()"
|
||||||
|
```
|
||||||
|
|
||||||
|
The script reports: top memory allocators (tracemalloc), GC collection count and total time, and top project functions by cumtime with call counts and file locations. On the first run it records a baseline total; subsequent runs print the delta percentage.
|
||||||
|
|
||||||
|
**Choosing what to profile:** Use the test or benchmark that exercises the code path the user cares about. If the user said "make X faster", profile whatever runs X. If they gave a general request, use the project's test suite or a representative benchmark. Do NOT profile `import mypackage` unless the user specifically asked about import/startup time.
|
||||||
|
|
||||||
### Building the unified target table
|
### Building the unified target table
|
||||||
|
|
||||||
After the unified profile, cross-reference CPU hotspots with memory allocators to identify multi-domain targets:
|
After the unified profile, cross-reference CPU hotspots with memory allocators to identify multi-domain targets:
|
||||||
|
|
@ -305,6 +278,7 @@ When you encounter a domain-specific pattern, consult the domain reference for t
|
||||||
| Sequential awaits, blocking calls, async patterns | `../references/async/guide.md` |
|
| Sequential awaits, blocking calls, async patterns | `../references/async/guide.md` |
|
||||||
| Import time, circular deps, module structure | `../references/structure/guide.md` |
|
| Import time, circular deps, module structure | `../references/structure/guide.md` |
|
||||||
| After KEEP, authoritative e2e measurement | `${CLAUDE_PLUGIN_ROOT}/references/shared/e2e-benchmarks.md` |
|
| After KEEP, authoritative e2e measurement | `${CLAUDE_PLUGIN_ROOT}/references/shared/e2e-benchmarks.md` |
|
||||||
|
| Stuck, teammates stalled, context lost, workflow broken | `${CLAUDE_PLUGIN_ROOT}/references/shared/failure-modes.md` |
|
||||||
|
|
||||||
**Read on demand, not upfront.** Only load a reference when you've identified a concrete pattern through profiling. This keeps your context focused.
|
**Read on demand, not upfront.** Only load a reference when you've identified a concrete pattern through profiling. This keeps your context focused.
|
||||||
|
|
||||||
|
|
@ -527,6 +501,7 @@ If 5+ consecutive discards across all dimensions and strategies:
|
||||||
4. **Try the opposite.** If fine-grained fixes keep failing, try a coarser architectural change that spans domains.
|
4. **Try the opposite.** If fine-grained fixes keep failing, try a coarser architectural change that spans domains.
|
||||||
5. **Check for missed interactions.** Register a `gc.callbacks` hook to time collections if you haven't — the GC→CPU interaction is the most commonly missed.
|
5. **Check for missed interactions.** Register a `gc.callbacks` hook to time collections if you haven't — the GC→CPU interaction is the most commonly missed.
|
||||||
6. **Re-read original goal.** Has the focus drifted?
|
6. **Re-read original goal.** Has the focus drifted?
|
||||||
|
7. **Consult failure modes.** Read `${CLAUDE_PLUGIN_ROOT}/references/shared/failure-modes.md` for known workflow failure patterns — deadlocks, silent teammate failures, context loss after compaction, stale results, and ambiguous completion criteria. These are structural problems that look like being stuck but have specific recovery procedures.
|
||||||
|
|
||||||
If still stuck after 3 more experiments, **stop and report** with a comprehensive cross-domain analysis of why the code is at its floor.
|
If still stuck after 3 more experiments, **stop and report** with a comprehensive cross-domain analysis of why the code is at its floor.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,65 +1,144 @@
|
||||||
# /tmp/deep_profile.py
|
|
||||||
# Unified CPU + Memory + GC profiling script for the primary optimizer.
|
# Unified CPU + Memory + GC profiling script for the primary optimizer.
|
||||||
# This is the MANDATORY first step — gives the cross-domain view that
|
# This is the MANDATORY first step — gives the cross-domain view that
|
||||||
# single-domain agents lack.
|
# single-domain agents lack.
|
||||||
#
|
#
|
||||||
# Usage: Adapt the "RUN TARGET HERE" section for your test/benchmark,
|
# Usage:
|
||||||
# then run with: $RUNNER /tmp/deep_profile.py
|
# $RUNNER /tmp/deep_profile.py <source_root> -- <command> [args...]
|
||||||
|
#
|
||||||
|
# Examples:
|
||||||
|
# $RUNNER /tmp/deep_profile.py src -- pytest tests/test_pipeline.py -x
|
||||||
|
# $RUNNER /tmp/deep_profile.py mypackage -- python -c "from mypackage import run; run()"
|
||||||
|
# $RUNNER /tmp/deep_profile.py . -- python scripts/benchmark.py
|
||||||
|
#
|
||||||
|
# <source_root> is the directory containing project source code. Only
|
||||||
|
# functions under this path appear in the CPU report. Read this from
|
||||||
|
# .codeflash/setup.md ("Project root" + the package directory), or pass
|
||||||
|
# "." to include everything.
|
||||||
|
#
|
||||||
|
# Everything after "--" is the command to profile. The script runs it as
|
||||||
|
# a subprocess under cProfile + tracemalloc + GC tracking.
|
||||||
|
|
||||||
import cProfile
|
import cProfile
|
||||||
import gc
|
import gc
|
||||||
import os
|
import os
|
||||||
import pstats
|
import pstats
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import tracemalloc
|
import tracemalloc
|
||||||
|
|
||||||
# Track GC to quantify allocation→CPU interaction
|
BASELINE_PATH = "/tmp/deep_baseline_total"
|
||||||
gc_times = []
|
|
||||||
|
|
||||||
|
|
||||||
def gc_callback(phase, info):
|
def parse_args(args):
|
||||||
if phase == "start":
|
"""Parse <source_root> -- <command...> from argv."""
|
||||||
gc_callback._start = time.perf_counter()
|
if "--" not in args or len(args) < 3: # noqa: PLR2004
|
||||||
elif phase == "stop":
|
print(
|
||||||
gc_times.append(time.perf_counter() - gc_callback._start)
|
"Usage: python deep_profile.py <source_root> -- <command> [args...]",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
"Example: python deep_profile.py src -- pytest tests/ -x",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
sep = args.index("--")
|
||||||
|
source_root = os.path.abspath(args[sep - 1])
|
||||||
|
cmd = args[sep + 1 :]
|
||||||
|
|
||||||
|
if not os.path.isdir(source_root): # noqa: PTH112
|
||||||
|
print(
|
||||||
|
f"Error: source root '{source_root}' is not a directory",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
return source_root, cmd
|
||||||
|
|
||||||
|
|
||||||
gc.callbacks.append(gc_callback)
|
def profile_command(cmd):
|
||||||
|
"""Run cmd under cProfile + tracemalloc + GC tracking."""
|
||||||
|
gc_times = []
|
||||||
|
|
||||||
tracemalloc.start()
|
def gc_callback(phase, _info):
|
||||||
profiler = cProfile.Profile()
|
if phase == "start":
|
||||||
|
gc_callback._start = time.perf_counter()
|
||||||
|
elif phase == "stop":
|
||||||
|
gc_times.append(time.perf_counter() - gc_callback._start)
|
||||||
|
|
||||||
profiler.enable()
|
gc.callbacks.append(gc_callback)
|
||||||
# === RUN TARGET HERE ===
|
tracemalloc.start()
|
||||||
profiler.disable()
|
profiler = cProfile.Profile()
|
||||||
|
|
||||||
mem_snapshot = tracemalloc.take_snapshot()
|
profiler.enable()
|
||||||
profiler.dump_stats("/tmp/deep_cpu.prof")
|
result = subprocess.run(cmd, check=False) # noqa: S603
|
||||||
|
profiler.disable()
|
||||||
|
|
||||||
# Memory top allocators
|
mem_snapshot = tracemalloc.take_snapshot()
|
||||||
print("=== MEMORY: Top allocators ===")
|
with tempfile.NamedTemporaryFile(
|
||||||
for stat in mem_snapshot.statistics("lineno")[:15]:
|
suffix=".prof", prefix="deep_cpu_", delete=False
|
||||||
print(stat)
|
) as prof_file:
|
||||||
|
prof_path = prof_file.name
|
||||||
|
profiler.dump_stats(prof_path)
|
||||||
|
|
||||||
# GC impact
|
return result, mem_snapshot, gc_times, prof_path
|
||||||
total_gc = sum(gc_times)
|
|
||||||
print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===")
|
|
||||||
|
|
||||||
# CPU top functions (project-only)
|
|
||||||
print("\n=== CPU: Top project functions ===")
|
def report_results(source_root, mem_snapshot, gc_times, prof_path):
|
||||||
p = pstats.Stats("/tmp/deep_cpu.prof")
|
"""Print unified profile report."""
|
||||||
stats = p.stats
|
print("\n" + "=" * 60)
|
||||||
src = os.path.abspath("src") # adjust to project source root
|
print("UNIFIED PROFILE RESULTS")
|
||||||
project_funcs = []
|
print("=" * 60)
|
||||||
for (file, line, name), (cc, nc, tt, ct, callers) in stats.items():
|
|
||||||
if not os.path.abspath(file).startswith(src):
|
# Memory top allocators
|
||||||
continue
|
print("\n=== MEMORY: Top allocators ===")
|
||||||
project_funcs.append((ct, tt, name, file, line))
|
for stat in mem_snapshot.statistics("lineno")[:15]:
|
||||||
project_funcs.sort(reverse=True)
|
print(stat)
|
||||||
total = project_funcs[0][0] if project_funcs else 1
|
|
||||||
if not os.path.exists("/tmp/deep_baseline_total"):
|
# GC impact
|
||||||
with open("/tmp/deep_baseline_total", "w") as f:
|
total_gc = sum(gc_times)
|
||||||
f.write(str(total))
|
print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===")
|
||||||
for ct, tt, name, file, line in project_funcs[:15]:
|
|
||||||
pct = ct / total * 100
|
# CPU top functions (project-only)
|
||||||
print(f" {name:30s} — {pct:5.1f}% cumtime, {tt:.3f}s self")
|
print(f"\n=== CPU: Top project functions (source root: {source_root}) ===")
|
||||||
|
p = pstats.Stats(prof_path)
|
||||||
|
project_funcs = []
|
||||||
|
for (file, line, name), (cc, nc, tt, ct, callers) in p.stats.items():
|
||||||
|
if not os.path.abspath(file).startswith(source_root):
|
||||||
|
continue
|
||||||
|
project_funcs.append((ct, tt, nc, name, file, line))
|
||||||
|
project_funcs.sort(reverse=True)
|
||||||
|
total = project_funcs[0][0] if project_funcs else 1
|
||||||
|
|
||||||
|
# Baseline delta tracking
|
||||||
|
if not os.path.exists(BASELINE_PATH):
|
||||||
|
with open(BASELINE_PATH, "w") as f:
|
||||||
|
f.write(str(total))
|
||||||
|
print(f" (baseline recorded: {total:.3f}s)")
|
||||||
|
else:
|
||||||
|
with open(BASELINE_PATH) as f:
|
||||||
|
baseline = float(f.read().strip())
|
||||||
|
delta = (total - baseline) / baseline * 100
|
||||||
|
print(
|
||||||
|
f" (baseline: {baseline:.3f}s, current: {total:.3f}s, delta: {delta:+.1f}%)"
|
||||||
|
)
|
||||||
|
|
||||||
|
for ct, tt, nc, name, file, line in project_funcs[:15]:
|
||||||
|
pct = ct / total * 100
|
||||||
|
relpath = os.path.relpath(file)
|
||||||
|
print(
|
||||||
|
f" {name:30s} {pct:5.1f}% cumtime {tt:.3f}s self {nc:>6d} calls {relpath}:{line}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
source_root, cmd = parse_args(sys.argv[1:])
|
||||||
|
result, mem_snapshot, gc_times, prof_path = profile_command(cmd)
|
||||||
|
report_results(source_root, mem_snapshot, gc_times, prof_path)
|
||||||
|
sys.exit(result.returncode)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue