diff --git a/Makefile b/Makefile index fd9e60d..0125c52 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ LANGS := $(notdir $(wildcard plugin/languages/*)) build: clean @for lang in $(LANGS); do \ echo "Assembling plugin ($$lang) → dist-$$lang/"; \ - rsync -a --exclude='languages/' plugin/ dist-$$lang/; \ + rsync -a --exclude='languages/' --exclude='README.md' --exclude='ARCHITECTURE.md' --exclude='ROADMAP.md' plugin/ dist-$$lang/; \ rsync -a plugin/languages/$$lang/agents/ dist-$$lang/agents/; \ rsync -a plugin/languages/$$lang/references/ dist-$$lang/references/; \ rsync -a plugin/languages/$$lang/skills/ dist-$$lang/skills/; \ diff --git a/plugin/languages/python/agents/codeflash-deep.md b/plugin/languages/python/agents/codeflash-deep.md index 85c6c79..66b4873 100644 --- a/plugin/languages/python/agents/codeflash-deep.md +++ b/plugin/languages/python/agents/codeflash-deep.md @@ -166,60 +166,33 @@ You MUST profile before making any code changes. The unified profiling script be ### Unified CPU + Memory profiling (MANDATORY first step) -This gives you the cross-domain view that single-domain agents lack. +This gives you the cross-domain view that single-domain agents lack. The script lives at `${CLAUDE_PLUGIN_ROOT}/languages/python/references/unified-profiling-script.py` — copy it to `/tmp/deep_profile.py` and run it. 
-```python -# /tmp/deep_profile.py -import cProfile, tracemalloc, gc, time, pstats, os, sys - -# Track GC to quantify allocation→CPU interaction -gc_times = [] -def gc_callback(phase, info): - if phase == 'start': - gc_callback._start = time.perf_counter() - elif phase == 'stop': - gc_times.append(time.perf_counter() - gc_callback._start) -gc.callbacks.append(gc_callback) - -tracemalloc.start() -profiler = cProfile.Profile() - -profiler.enable() -# === RUN TARGET HERE === -profiler.disable() - -mem_snapshot = tracemalloc.take_snapshot() -profiler.dump_stats('/tmp/deep_cpu.prof') - -# Memory top allocators -print("=== MEMORY: Top allocators ===") -for stat in mem_snapshot.statistics('lineno')[:15]: - print(stat) - -# GC impact -total_gc = sum(gc_times) -print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===") - -# CPU top functions (project-only) -print("\n=== CPU: Top project functions ===") -p = pstats.Stats('/tmp/deep_cpu.prof') -stats = p.stats -src = os.path.abspath('src') # adjust to project source root -project_funcs = [] -for (file, line, name), (cc, nc, tt, ct, callers) in stats.items(): - if not os.path.abspath(file).startswith(src): - continue - project_funcs.append((ct, tt, name, file, line)) -project_funcs.sort(reverse=True) -total = project_funcs[0][0] if project_funcs else 1 -if not os.path.exists('/tmp/deep_baseline_total'): - with open('/tmp/deep_baseline_total', 'w') as f: - f.write(str(total)) -for ct, tt, name, file, line in project_funcs[:15]: - pct = ct / total * 100 - print(f" {name:30s} — {pct:5.1f}% cumtime, {tt:.3f}s self") +```bash +cp "${CLAUDE_PLUGIN_ROOT}/languages/python/references/unified-profiling-script.py" /tmp/deep_profile.py ``` +**Usage:** `$RUNNER /tmp/deep_profile.py <source-root> -- <command> [args...]` + +- `<source-root>` — directory containing project source. Only functions under this path appear in the CPU report. Read this from `.codeflash/setup.md` (the package directory name — e.g., `src`, `mypackage`, or `.` to include everything). 
+- Everything after `--` is the command to profile. + +**Examples:** +```bash +# Profile a specific test +$RUNNER /tmp/deep_profile.py src -- pytest tests/test_pipeline.py -x + +# Profile a benchmark script +$RUNNER /tmp/deep_profile.py mypackage -- python scripts/benchmark.py + +# Profile an import + function call +$RUNNER /tmp/deep_profile.py . -- python -c "from mypackage import run; run()" +``` + +The script reports: top memory allocators (tracemalloc), GC collection count and total time, and top project functions by cumtime with call counts and file locations. On the first run it records a baseline total; subsequent runs print the delta percentage. + +**Choosing what to profile:** Use the test or benchmark that exercises the code path the user cares about. If the user said "make X faster", profile whatever runs X. If they gave a general request, use the project's test suite or a representative benchmark. Do NOT profile `import mypackage` unless the user specifically asked about import/startup time. + ### Building the unified target table After the unified profile, cross-reference CPU hotspots with memory allocators to identify multi-domain targets: @@ -305,6 +278,7 @@ When you encounter a domain-specific pattern, consult the domain reference for t | Sequential awaits, blocking calls, async patterns | `../references/async/guide.md` | | Import time, circular deps, module structure | `../references/structure/guide.md` | | After KEEP, authoritative e2e measurement | `${CLAUDE_PLUGIN_ROOT}/references/shared/e2e-benchmarks.md` | +| Stuck, teammates stalled, context lost, workflow broken | `${CLAUDE_PLUGIN_ROOT}/references/shared/failure-modes.md` | **Read on demand, not upfront.** Only load a reference when you've identified a concrete pattern through profiling. This keeps your context focused. @@ -527,6 +501,7 @@ If 5+ consecutive discards across all dimensions and strategies: 4. 
**Try the opposite.** If fine-grained fixes keep failing, try a coarser architectural change that spans domains. 5. **Check for missed interactions.** Run gc.callbacks if you haven't — the GC→CPU interaction is the most commonly missed. 6. **Re-read original goal.** Has the focus drifted? +7. **Consult failure modes.** Read `${CLAUDE_PLUGIN_ROOT}/references/shared/failure-modes.md` for known workflow failure patterns — deadlocks, silent teammate failures, context loss after compaction, stale results, and ambiguous completion criteria. These are structural problems that look like being stuck but have specific recovery procedures. If still stuck after 3 more experiments, **stop and report** with a comprehensive cross-domain analysis of why the code is at its floor. diff --git a/plugin/languages/python/references/unified-profiling-script.py b/plugin/languages/python/references/unified-profiling-script.py index 6358b23..6ccb227 100644 --- a/plugin/languages/python/references/unified-profiling-script.py +++ b/plugin/languages/python/references/unified-profiling-script.py @@ -1,65 +1,144 @@ -# /tmp/deep_profile.py # Unified CPU + Memory + GC profiling script for the primary optimizer. # This is the MANDATORY first step — gives the cross-domain view that # single-domain agents lack. # -# Usage: Adapt the "RUN TARGET HERE" section for your test/benchmark, -# then run with: $RUNNER /tmp/deep_profile.py +# Usage: +# $RUNNER /tmp/deep_profile.py <source-root> -- <command> [args...] +# +# Examples: +# $RUNNER /tmp/deep_profile.py src -- pytest tests/test_pipeline.py -x +# $RUNNER /tmp/deep_profile.py mypackage -- python -c "from mypackage import run; run()" +# $RUNNER /tmp/deep_profile.py . -- python scripts/benchmark.py +# +# <source-root> is the directory containing project source code. Only +# functions under this path appear in the CPU report. Read this from +# .codeflash/setup.md ("Project root" + the package directory), or pass +# "." to include everything. 
+# +# Everything after "--" is the command to profile. The script runs it as +# a subprocess under cProfile + tracemalloc + GC tracking. NOTE(review): cProfile/tracemalloc/gc.callbacks instrument only this parent process, not the child command — confirm the child's work is actually captured. import cProfile import gc import os import pstats +import subprocess +import sys +import tempfile import time import tracemalloc -# Track GC to quantify allocation→CPU interaction -gc_times = [] +BASELINE_PATH = "/tmp/deep_baseline_total" -def gc_callback(phase, info): - if phase == "start": - gc_callback._start = time.perf_counter() - elif phase == "stop": - gc_times.append(time.perf_counter() - gc_callback._start) +def parse_args(args): + """Parse <source-root> -- <command> from argv.""" + if "--" not in args or len(args) < 3: # noqa: PLR2004 + print( + "Usage: python deep_profile.py <source-root> -- <command> [args...]", + file=sys.stderr, + ) + print( + "Example: python deep_profile.py src -- pytest tests/ -x", + file=sys.stderr, + ) + sys.exit(1) + + sep = args.index("--") + source_root = os.path.abspath(args[sep - 1]) + cmd = args[sep + 1 :] + + if not os.path.isdir(source_root): # noqa: PTH112 + print( + f"Error: source root '{source_root}' is not a directory", + file=sys.stderr, + ) + sys.exit(1) + + return source_root, cmd -gc.callbacks.append(gc_callback) +def profile_command(cmd): + """Run cmd under cProfile + tracemalloc + GC tracking.""" + gc_times = [] -tracemalloc.start() -profiler = cProfile.Profile() + def gc_callback(phase, _info): + if phase == "start": + gc_callback._start = time.perf_counter() + elif phase == "stop": + gc_times.append(time.perf_counter() - gc_callback._start) -profiler.enable() -# === RUN TARGET HERE === -profiler.disable() + gc.callbacks.append(gc_callback) + tracemalloc.start() + profiler = cProfile.Profile() -mem_snapshot = tracemalloc.take_snapshot() -profiler.dump_stats("/tmp/deep_cpu.prof") + profiler.enable() + result = subprocess.run(cmd, check=False) # noqa: S603 + profiler.disable() -# Memory top allocators -print("=== MEMORY: Top allocators ===") -for stat in mem_snapshot.statistics("lineno")[:15]: - print(stat) + mem_snapshot = 
tracemalloc.take_snapshot() + with tempfile.NamedTemporaryFile( + suffix=".prof", prefix="deep_cpu_", delete=False + ) as prof_file: + prof_path = prof_file.name + profiler.dump_stats(prof_path) -# GC impact -total_gc = sum(gc_times) -print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===") + return result, mem_snapshot, gc_times, prof_path -# CPU top functions (project-only) -print("\n=== CPU: Top project functions ===") -p = pstats.Stats("/tmp/deep_cpu.prof") -stats = p.stats -src = os.path.abspath("src") # adjust to project source root -project_funcs = [] -for (file, line, name), (cc, nc, tt, ct, callers) in stats.items(): - if not os.path.abspath(file).startswith(src): - continue - project_funcs.append((ct, tt, name, file, line)) -project_funcs.sort(reverse=True) -total = project_funcs[0][0] if project_funcs else 1 -if not os.path.exists("/tmp/deep_baseline_total"): - with open("/tmp/deep_baseline_total", "w") as f: - f.write(str(total)) -for ct, tt, name, file, line in project_funcs[:15]: - pct = ct / total * 100 - print(f" {name:30s} — {pct:5.1f}% cumtime, {tt:.3f}s self") + +def report_results(source_root, mem_snapshot, gc_times, prof_path): + """Print unified profile report.""" + print("\n" + "=" * 60) + print("UNIFIED PROFILE RESULTS") + print("=" * 60) + + # Memory top allocators + print("\n=== MEMORY: Top allocators ===") + for stat in mem_snapshot.statistics("lineno")[:15]: + print(stat) + + # GC impact + total_gc = sum(gc_times) + print(f"\n=== GC: {len(gc_times)} collections, {total_gc:.3f}s total ===") + + # CPU top functions (project-only) + print(f"\n=== CPU: Top project functions (source root: {source_root}) ===") + p = pstats.Stats(prof_path) + project_funcs = [] + for (file, line, name), (cc, nc, tt, ct, callers) in p.stats.items(): + if not os.path.abspath(file).startswith(source_root): + continue + project_funcs.append((ct, tt, nc, name, file, line)) + project_funcs.sort(reverse=True) + total = project_funcs[0][0] if 
project_funcs else 1 + + # Baseline delta tracking + if not os.path.exists(BASELINE_PATH): + with open(BASELINE_PATH, "w") as f: + f.write(str(total)) + print(f" (baseline recorded: {total:.3f}s)") + else: + with open(BASELINE_PATH) as f: + baseline = float(f.read().strip()) + delta = (total - baseline) / baseline * 100 + print( + f" (baseline: {baseline:.3f}s, current: {total:.3f}s, delta: {delta:+.1f}%)" + ) + + for ct, tt, nc, name, file, line in project_funcs[:15]: + pct = ct / total * 100 + relpath = os.path.relpath(file) + print( + f" {name:30s} {pct:5.1f}% cumtime {tt:.3f}s self {nc:>6d} calls {relpath}:{line}" + ) + + +def main(): + source_root, cmd = parse_args(sys.argv[1:]) + result, mem_snapshot, gc_times, prof_path = profile_command(cmd) + report_results(source_root, mem_snapshot, gc_times, prof_path) + sys.exit(result.returncode) + + +if __name__ == "__main__": + main()