From 7e00007569066fe2aeb14f2fc7c38d0055a17140 Mon Sep 17 00:00:00 2001
From: Kevin Turcios <106575910+KRRT7@users.noreply.github.com>
Date: Wed, 15 Apr 2026 04:11:52 -0500
Subject: [PATCH] Improve deep optimizer: profiling script + failure modes +
 dist fix (#24)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Exclude dev docs from plugin dist builds

README.md, ARCHITECTURE.md, and ROADMAP.md are development docs that
shouldn't ship in the assembled plugin distributions.

* Improve deep optimizer: fix profiling script, add failure mode awareness

Profiling script: Accept source root and command as CLI args instead of
hardcoding `src` and requiring manual `# === RUN TARGET HERE ===` edits.
The agent now copies the script from references and runs it with the
project's actual source root and test command.

Failure modes: Wire failure-modes.md into the on-demand reference table
and stuck recovery checklist so the agent consults it when workflows
break (deadlocks, silent failures, context loss, stale results).

* Fix ruff lint errors in unified profiling script

Refactor main() into parse_args(), profile_command(), and
report_results() to fix C901 (complexity) and PLR0915 (too many
statements). Also fix S306 (mktemp → NamedTemporaryFile), PLW1510
(explicit check=False), and add noqa for intentional os.path usage
(PTH112) and subprocess with CLI args (S603).
---
 Makefile                                      |   2 +-
 .../languages/python/agents/codeflash-deep.md |  77 +++-----
 .../references/unified-profiling-script.py    | 165 +++++++++++++-----
 3 files changed, 149 insertions(+), 95 deletions(-)

diff --git a/Makefile b/Makefile
index fd9e60d..0125c52 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ LANGS := $(notdir $(wildcard plugin/languages/*))
 build: clean
 	@for lang in $(LANGS); do \
 		echo "Assembling plugin ($$lang) → dist-$$lang/"; \
-		rsync -a --exclude='languages/' plugin/ dist-$$lang/; \
+		rsync -a --exclude='languages/' --exclude='README.md' --exclude='ARCHITECTURE.md' --exclude='ROADMAP.md' plugin/ dist-$$lang/; \
 		rsync -a plugin/languages/$$lang/agents/ dist-$$lang/agents/; \
 		rsync -a plugin/languages/$$lang/references/ dist-$$lang/references/; \
 		rsync -a plugin/languages/$$lang/skills/ dist-$$lang/skills/; \
diff --git a/plugin/languages/python/agents/codeflash-deep.md b/plugin/languages/python/agents/codeflash-deep.md
index 85c6c79..66b4873 100644
--- a/plugin/languages/python/agents/codeflash-deep.md
+++ b/plugin/languages/python/agents/codeflash-deep.md
@@ -166,60 +166,33 @@ You MUST profile before making any code changes. The unified profiling script be
 
 ### Unified CPU + Memory profiling (MANDATORY first step)
 
-This gives you the cross-domain view that single-domain agents lack.
+This gives you the cross-domain view that single-domain agents lack. The script lives at `${CLAUDE_PLUGIN_ROOT}/languages/python/references/unified-profiling-script.py` — copy it to `/tmp/deep_profile.py` and run it.
 
-```python
-# /tmp/deep_profile.py
-import cProfile, tracemalloc, gc, time, pstats, os, sys
-
-# Track GC to quantify allocation→CPU interaction
-gc_times = []
-def gc_callback(phase, info):
-    if phase == 'start':
-        gc_callback._start = time.perf_counter()
-    elif phase == 'stop':
-        gc_times.append(time.perf_counter() - gc_callback._start)
-gc.callbacks.append(gc_callback)
-
-tracemalloc.start()
-profiler = cProfile.Profile()
-
-profiler.enable()
-# === RUN TARGET HERE ===
-profiler.disable()
-
-mem_snapshot = tracemalloc.take_snapshot()
-profiler.dump_stats('/tmp/deep_cpu.prof')
-
-# Memory top allocators
-print("=== MEMORY: Top allocators ===")
-for stat in mem_snapshot.statistics('lineno')[:15]:
-    print(stat)
-
-# GC impact
-total_gc = sum(gc_times)
-print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===")
-
-# CPU top functions (project-only)
-print("\n=== CPU: Top project functions ===")
-p = pstats.Stats('/tmp/deep_cpu.prof')
-stats = p.stats
-src = os.path.abspath('src')  # adjust to project source root
-project_funcs = []
-for (file, line, name), (cc, nc, tt, ct, callers) in stats.items():
-    if not os.path.abspath(file).startswith(src):
-        continue
-    project_funcs.append((ct, tt, name, file, line))
-project_funcs.sort(reverse=True)
-total = project_funcs[0][0] if project_funcs else 1
-if not os.path.exists('/tmp/deep_baseline_total'):
-    with open('/tmp/deep_baseline_total', 'w') as f:
-        f.write(str(total))
-for ct, tt, name, file, line in project_funcs[:15]:
-    pct = ct / total * 100
-    print(f"  {name:30s} — {pct:5.1f}% cumtime, {tt:.3f}s self")
+```bash
+cp "${CLAUDE_PLUGIN_ROOT}/languages/python/references/unified-profiling-script.py" /tmp/deep_profile.py
 ```
 
+**Usage:** `$RUNNER /tmp/deep_profile.py <source_root> -- <command> [args...]`
+
+- `<source_root>` — directory containing project source. Only functions under this path appear in the CPU report. Read this from `.codeflash/setup.md` (the package directory name — e.g., `src`, `mypackage`, or `.` to include everything).
+- Everything after `--` is the command to profile.
+
+**Examples:**
+```bash
+# Profile a specific test
+$RUNNER /tmp/deep_profile.py src -- pytest tests/test_pipeline.py -x
+
+# Profile a benchmark script
+$RUNNER /tmp/deep_profile.py mypackage -- python scripts/benchmark.py
+
+# Profile an import + function call
+$RUNNER /tmp/deep_profile.py . -- python -c "from mypackage import run; run()"
+```
+
+The script reports: top memory allocators (tracemalloc), GC collection count and total time, and top project functions by cumtime with call counts and file locations. On the first run it records a baseline total in `/tmp/deep_baseline_total`; subsequent runs print the delta percentage against that baseline (delete the file to start a fresh baseline).
+
+**Choosing what to profile:** Use the test or benchmark that exercises the code path the user cares about. If the user said "make X faster", profile whatever runs X. If they gave a general request, use the project's test suite or a representative benchmark. Do NOT profile `import mypackage` unless the user specifically asked about import/startup time.
+
 ### Building the unified target table
 
 After the unified profile, cross-reference CPU hotspots with memory allocators to identify multi-domain targets:
@@ -305,6 +278,7 @@ When you encounter a domain-specific pattern, consult the domain reference for t
 | Sequential awaits, blocking calls, async patterns | `../references/async/guide.md` |
 | Import time, circular deps, module structure | `../references/structure/guide.md` |
 | After KEEP, authoritative e2e measurement | `${CLAUDE_PLUGIN_ROOT}/references/shared/e2e-benchmarks.md` |
+| Stuck, teammates stalled, context lost, workflow broken | `${CLAUDE_PLUGIN_ROOT}/references/shared/failure-modes.md` |
 
 **Read on demand, not upfront.** Only load a reference when you've identified a concrete pattern through profiling. This keeps your context focused.
 
@@ -527,6 +501,7 @@ If 5+ consecutive discards across all dimensions and strategies:
 4. **Try the opposite.** If fine-grained fixes keep failing, try a coarser architectural change that spans domains.
 5. **Check for missed interactions.** Run gc.callbacks if you haven't — the GC→CPU interaction is the most commonly missed.
 6. **Re-read original goal.** Has the focus drifted?
+7. **Consult failure modes.** Read `${CLAUDE_PLUGIN_ROOT}/references/shared/failure-modes.md` for known workflow failure patterns — deadlocks, silent teammate failures, context loss after compaction, stale results, and ambiguous completion criteria. These are structural problems that look like being stuck but have specific recovery procedures.
 
 If still stuck after 3 more experiments, **stop and report** with a comprehensive cross-domain analysis of why the code is at its floor.
 
diff --git a/plugin/languages/python/references/unified-profiling-script.py b/plugin/languages/python/references/unified-profiling-script.py
index 6358b23..6ccb227 100644
--- a/plugin/languages/python/references/unified-profiling-script.py
+++ b/plugin/languages/python/references/unified-profiling-script.py
@@ -1,65 +1,144 @@
-# /tmp/deep_profile.py
 # Unified CPU + Memory + GC profiling script for the primary optimizer.
 # This is the MANDATORY first step — gives the cross-domain view that
 # single-domain agents lack.
 #
-# Usage: Adapt the "RUN TARGET HERE" section for your test/benchmark,
-# then run with: $RUNNER /tmp/deep_profile.py
+# Usage:
+#   $RUNNER /tmp/deep_profile.py <source_root> -- <command> [args...]
+#
+# Examples:
+#   $RUNNER /tmp/deep_profile.py src -- pytest tests/test_pipeline.py -x
+#   $RUNNER /tmp/deep_profile.py mypackage -- python -c "from mypackage import run; run()"
+#   $RUNNER /tmp/deep_profile.py . -- python scripts/benchmark.py
+#
+# <source_root> is the directory containing project source code. Only
+# functions under this path appear in the CPU report. Read this from
+# .codeflash/setup.md ("Project root" + the package directory), or pass
+# "." to include everything.
+#
+# Everything after "--" is the command to profile. It is started with
+# subprocess.run while cProfile + tracemalloc + GC tracking run in this process.
 
 import cProfile
 import gc
 import os
 import pstats
+import subprocess
+import sys
+import tempfile
 import time
 import tracemalloc
 
-# Track GC to quantify allocation→CPU interaction
-gc_times = []
+BASELINE_PATH = "/tmp/deep_baseline_total"
 
 
-def gc_callback(phase, info):
-    if phase == "start":
-        gc_callback._start = time.perf_counter()
-    elif phase == "stop":
-        gc_times.append(time.perf_counter() - gc_callback._start)
+def parse_args(args):
+    """Split ``<source_root> -- <command...>`` (argv without the program name).
+
+    Returns ``(source_root, cmd)``: the absolute source root and the non-empty
+    command list. Prints usage or an error to stderr and exits with status 1
+    when ``--`` is missing, nothing precedes or follows it, or the source root
+    is not a directory.
+    """
+    # Index 0 is invalid too: args[sep - 1] would wrap around to the last arg.
+    sep = args.index("--") if "--" in args else 0
+    cmd = args[sep + 1 :]
+    if sep < 1 or not cmd:
+        print(
+            "Usage: python deep_profile.py <source_root> -- <command> [args...]\n"
+            "Example: python deep_profile.py src -- pytest tests/ -x",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    source_root = os.path.abspath(args[sep - 1])
+    if not os.path.isdir(source_root):  # noqa: PTH112
+        msg = f"Error: source root '{source_root}' is not a directory"
+        print(msg, file=sys.stderr)
+        sys.exit(1)
+    return source_root, cmd
 
 
-gc.callbacks.append(gc_callback)
+def profile_command(cmd):
+    """Run cmd in a child process while profiling this one (cProfile, tracemalloc, GC)."""
+    gc_times = []  # duration in seconds of each GC pass seen by gc_callback
 
-tracemalloc.start()
-profiler = cProfile.Profile()
+    def gc_callback(phase, _info):
+        if phase == "start":
+            gc_callback._start = time.perf_counter()  # stash on the function object
+        elif phase == "stop":
+            gc_times.append(time.perf_counter() - gc_callback._start)
 
-profiler.enable()
-# === RUN TARGET HERE ===
-profiler.disable()
+    gc.callbacks.append(gc_callback)
+    tracemalloc.start()
+    profiler = cProfile.Profile()
 
-mem_snapshot = tracemalloc.take_snapshot()
-profiler.dump_stats("/tmp/deep_cpu.prof")
+    profiler.enable()  # NOTE(review): confirm the child's own work shows up in these stats
+    result = subprocess.run(cmd, check=False)  # noqa: S603
+    profiler.disable()
 
-# Memory top allocators
-print("=== MEMORY: Top allocators ===")
-for stat in mem_snapshot.statistics("lineno")[:15]:
-    print(stat)
+    mem_snapshot = tracemalloc.take_snapshot()
+    with tempfile.NamedTemporaryFile(
+        suffix=".prof", prefix="deep_cpu_", delete=False
+    ) as prof_file:
+        prof_path = prof_file.name  # delete=False: the file outlives this with-block
+    profiler.dump_stats(prof_path)
 
-# GC impact
-total_gc = sum(gc_times)
-print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===")
+    return result, mem_snapshot, gc_times, prof_path
 
-# CPU top functions (project-only)
-print("\n=== CPU: Top project functions ===")
-p = pstats.Stats("/tmp/deep_cpu.prof")
-stats = p.stats
-src = os.path.abspath("src")  # adjust to project source root
-project_funcs = []
-for (file, line, name), (cc, nc, tt, ct, callers) in stats.items():
-    if not os.path.abspath(file).startswith(src):
-        continue
-    project_funcs.append((ct, tt, name, file, line))
-project_funcs.sort(reverse=True)
-total = project_funcs[0][0] if project_funcs else 1
-if not os.path.exists("/tmp/deep_baseline_total"):
-    with open("/tmp/deep_baseline_total", "w") as f:
-        f.write(str(total))
-for ct, tt, name, file, line in project_funcs[:15]:
-    pct = ct / total * 100
-    print(f"  {name:30s} — {pct:5.1f}% cumtime, {tt:.3f}s self")
+
+def report_results(source_root, mem_snapshot, gc_times, prof_path):
+    """Print the memory, GC and project-only CPU sections of the profile.
+
+    The top cumulative time is kept in BASELINE_PATH to report run-to-run deltas.
+    """
+    rule = "=" * 60
+    print(f"\n{rule}\nUNIFIED PROFILE RESULTS\n{rule}")
+
+    print("\n=== MEMORY: Top allocators ===")
+    for stat in mem_snapshot.statistics("lineno")[:15]:
+        print(stat)
+    print(f"\n=== GC: {len(gc_times)} collections, {sum(gc_times):.3f}s total ===")
+
+    print(f"\n=== CPU: Top project functions (source root: {source_root}) ===")
+    # Trailing separator so "/x/src" does not also match "/x/src_old/...".
+    prefix = os.path.join(source_root, "")
+    stats = pstats.Stats(prof_path).stats
+    project_funcs = []
+    for (file, line, name), (_cc, nc, tt, ct, _callers) in stats.items():
+        if os.path.abspath(file).startswith(prefix):
+            project_funcs.append((ct, tt, nc, name, file, line))
+    project_funcs.sort(reverse=True)
+    if not project_funcs:
+        print("  (no profiled functions under the source root; baseline untouched)")
+        return
+    total = project_funcs[0][0]
+
+    if not os.path.exists(BASELINE_PATH):
+        with open(BASELINE_PATH, "w") as f:
+            f.write(str(total))
+        print(f"  (baseline recorded: {total:.3f}s)")
+    else:
+        with open(BASELINE_PATH) as f:
+            baseline = float(f.read().strip())
+        delta = (total - baseline) / baseline * 100 if baseline else 0.0
+        print(
+            f"  (baseline: {baseline:.3f}s, current: {total:.3f}s, delta: {delta:+.1f}%)"
+        )
+
+    for ct, tt, nc, name, file, line in project_funcs[:15]:
+        pct = ct / total * 100 if total else 0.0
+        relpath = os.path.relpath(file)
+        print(
+            f"  {name:30s} {pct:5.1f}% cumtime  {tt:.3f}s self  {nc:>6d} calls  {relpath}:{line}"
+        )
+
+
+def main():
+    root, command = parse_args(sys.argv[1:])  # exits with usage on bad argv
+    completed, *profile_data = profile_command(command)
+    report_results(root, *profile_data)
+    sys.exit(completed.returncode)  # propagate the profiled command's status
+
+
+if __name__ == "__main__":
+    main()