feat: add tool activity display and fix streaming timeout in observability chat

Restructure the agent loop to use stream() + finalMessage() for all API calls,
fixing the SDK's non-streaming timeout error at max_tokens=32k. Add parallel
tool execution and tool activity bubbles in the frontend, and restructure the
system prompt for better investigation behavior.
Kevin Turcios 2026-02-15 01:29:45 -05:00
parent 51372ca0ad
commit b09262ccbc
3 changed files with 332 additions and 132 deletions
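
For reference, the timeout fix reduces to one pattern change. A minimal standalone sketch of that pattern (TypeScript; the function name and prompt string are placeholders, not from this commit):

import Anthropic from "@anthropic-ai/sdk"

// With max_tokens this large, the SDK estimates the request may run >10 minutes
// and rejects a plain client.messages.create() unless streaming is used.
// stream() keeps the HTTP response streaming, while finalMessage() awaits the
// fully assembled Anthropic.Message, so call sites keep a request/response shape.
async function callOnce(client: Anthropic): Promise<Anthropic.Message> {
  const stream = client.messages.stream({
    model: "claude-opus-4-6", // model id as used in the diff below
    max_tokens: 32_000,
    messages: [{ role: "user", content: "placeholder prompt" }],
  })
  // No .on("text") handler is attached, so nothing is forwarded mid-stream.
  return stream.finalMessage()
}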


@@ -17,6 +17,18 @@ interface ChatMessage {
const MAX_TOOL_ROUNDS = 15
const KEEPALIVE_INTERVAL_MS = 15_000
const TOOL_DISPLAY_NAMES: Record<string, string> = {
get_original_code: "Reading original code",
get_candidate_code: "Reading candidate code",
get_test_code: "Reading test code",
get_ranking_details: "Reading ranking details",
get_errors: "Checking errors",
get_llm_call_detail: "Inspecting LLM call",
read_file: "Reading file",
search_code: "Searching codebase",
list_directory: "Listing directory",
}
function getClient(): Anthropic {
const baseURL = process.env.ANTHROPIC_FOUNDRY_BASE_URL
const apiKey = process.env.AZURE_OPENAI_API_KEY
@@ -26,26 +38,99 @@ function getClient(): Anthropic {
return new Anthropic({ baseURL, apiKey })
}
function summarizeToolResult(toolName: string, result: string): string {
const lines = result.split("\n").filter(Boolean)
switch (toolName) {
case "search_code": {
if (result === "No matches found.") return "No matches found"
return `Found ${lines.length} matching lines`
}
case "read_file": {
const header = lines[0] ?? ""
return header.startsWith("File:") ? header : `Read ${lines.length} lines`
}
case "get_errors": {
if (result === "No errors in this trace.") return "No errors"
const count = lines.filter((l) => l.startsWith("[")).length
return `Found ${count} errors`
}
case "get_llm_call_detail":
return "Loaded full LLM call details"
case "list_directory": {
const count = lines.length - 1 // subtract header line
return `Listed ${count} entries`
}
default:
return "Done"
}
}
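// Example (hypothetical inputs):
//   summarizeToolResult("search_code", "src/a.ts:4: foo\nsrc/b.ts:9: foo") → "Found 2 matching lines"
//   summarizeToolResult("get_errors", "No errors in this trace.") → "No errors"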
async function processToolCalls(
content: Anthropic.ContentBlock[],
indexed: IndexedTraceData,
enqueue: (data: string) => void,
): Promise<Anthropic.ToolResultBlockParam[]> {
const toolUseBlocks = content.filter(
(block): block is Anthropic.ToolUseBlock => block.type === "tool_use",
)
if (toolUseBlocks.length === 0) return []
// Emit tool_start events for all tools
for (const block of toolUseBlocks) {
enqueue(
`data: ${JSON.stringify({
type: "tool_start",
tool: block.name,
displayName: TOOL_DISPLAY_NAMES[block.name] ?? block.name,
})}\n\n`,
)
}
// Execute all tool calls in parallel
const results = await Promise.all(
toolUseBlocks.map(async (block) => {
const result = await resolveToolCall(
block.name,
(block.input as Record<string, unknown>) ?? {},
indexed,
)
enqueue(
`data: ${JSON.stringify({
type: "tool_result",
tool: block.name,
displayName: TOOL_DISPLAY_NAMES[block.name] ?? block.name,
summary: summarizeToolResult(block.name, result),
})}\n\n`,
)
return {
type: "tool_result" as const,
tool_use_id: block.id,
content: result,
}
}),
)
return results
}
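// Note: Promise.all preserves input order, so the returned tool_result blocks
// line up with their tool_use blocks even though execution is parallel. If any
// resolveToolCall rejects, Promise.all rejects as a whole, presumably surfacing
// in the route handler's try/catch rather than as a per-tool error.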
// Shared base params for both streaming and non-streaming calls.
// Adaptive thinking lets Claude decide how much reasoning is needed per request.
// On Opus 4.6, this automatically enables interleaved thinking (thinking between
// tool calls) without needing a beta header.
function baseParams(
systemPrompt: string,
conversationMessages: Anthropic.MessageParam[],
) {
return {
model: "claude-opus-4-6" as const,
max_tokens: 32_000,
system: systemPrompt,
messages: conversationMessages,
tools: anthropicToolDefinitions as Anthropic.Tool[],
thinking: { type: "adaptive" as const },
}
}
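// Both call sites below follow the same shape:
//   const stream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
//   const message = await stream.finalMessage()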
export async function POST(request: NextRequest): Promise<Response> {
@@ -55,7 +140,7 @@ export async function POST(request: NextRequest): Promise<Response> {
} catch (err) {
return Response.json(
{ error: err instanceof Error ? err.message : "Configuration error" },
{ status: 500 },
)
}
@@ -70,7 +155,7 @@ export async function POST(request: NextRequest): Promise<Response> {
if (!traceId || !messages?.length) {
return Response.json(
{ error: "traceId and messages are required" },
{ status: 400 },
)
}
@@ -82,7 +167,6 @@ export async function POST(request: NextRequest): Promise<Response> {
const indexed = indexTraceData(traceData)
const systemPrompt = buildSummaryPrompt(indexed)
const conversationMessages: Anthropic.MessageParam[] = messages.map((m) => ({
role: m.role,
content: m.content,
@@ -96,59 +180,44 @@ export async function POST(request: NextRequest): Promise<Response> {
const keepalive = setInterval(() => enqueue(": keepalive\n\n"), KEEPALIVE_INTERVAL_MS)
try {
// Tool resolution loop — uses stream() + finalMessage() to avoid the SDK's
// non-streaming timeout limit (max_tokens 32k estimates >10min, which blocks
// client.messages.create()). No text handler is attached, so no text reaches
// the frontend — avoiding "stutter" from intermediate fragments.
let toolRounds = 0
while (toolRounds < MAX_TOOL_ROUNDS) {
const toolStream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
const response = await toolStream.finalMessage()
if (response.stop_reason !== "tool_use") break
conversationMessages.push({ role: "assistant", content: response.content })
const toolResults = await processToolCalls(response.content, indexed, enqueue)
conversationMessages.push({ role: "user", content: toolResults })
toolRounds++
}
// Stream the final response — this is the only text the user sees
const messageStream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
messageStream.on("text", (textDelta) => {
enqueue(`data: ${JSON.stringify({ type: "text", text: textDelta })}\n\n`)
})
const finalMessage = await messageStream.finalMessage()
// Edge case: final streaming response ended with tool_use. Process the tool
// calls and make one more streaming call to get a text response.
if (finalMessage.stop_reason === "tool_use") {
conversationMessages.push({ role: "assistant", content: finalMessage.content })
const toolResults = await processToolCalls(finalMessage.content, indexed, enqueue)
conversationMessages.push({ role: "user", content: toolResults })
const followUp = client.messages.stream(baseParams(systemPrompt, conversationMessages))
followUp.on("text", (textDelta) => {
enqueue(`data: ${JSON.stringify({ type: "text", text: textDelta })}\n\n`)
})
await followUp.finalMessage()
}
enqueue("data: [DONE]\n\n")


@@ -11,6 +11,13 @@ interface ChatMessage {
content: string
}
interface ToolStep {
tool: string
displayName: string
status: "running" | "done"
summary?: string
}
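// Lifecycle: a "tool_start" event appends a step with status "running"; the
// matching "tool_result" event flips it to "done" and attaches the summary.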
interface TimelineChatProps {
traceId: string
isOpen: boolean
@@ -25,6 +32,8 @@ export const TimelineChat = memo(function TimelineChat({
const [messages, setMessages] = useState<ChatMessage[]>([])
const [input, setInput] = useState("")
const [isStreaming, setIsStreaming] = useState(false)
const [completedRounds, setCompletedRounds] = useState<ToolStep[][]>([])
const [activeSteps, setActiveSteps] = useState<ToolStep[]>([])
const messagesEndRef = useRef<HTMLDivElement>(null)
const inputRef = useRef<HTMLTextAreaElement>(null)
const abortRef = useRef<AbortController | null>(null)
@@ -35,7 +44,7 @@ export const TimelineChat = memo(function TimelineChat({
useEffect(() => {
scrollToBottom()
}, [messages, completedRounds, activeSteps, scrollToBottom])
useEffect(() => {
if (isOpen) {
@@ -57,6 +66,8 @@ export const TimelineChat = memo(function TimelineChat({
abortRef.current = controller
setMessages((prev) => [...prev, { role: "assistant", content: "" }])
setCompletedRounds([])
setActiveSteps([])
try {
const res = await fetch("/api/observability/chat", {
@@ -92,14 +103,52 @@ export const TimelineChat = memo(function TimelineChat({
try {
const parsed = JSON.parse(data)
// Handle typed events (new protocol)
if (parsed.type === "tool_start") {
setActiveSteps((prev) => {
// If all previous steps are done, this is a new round — commit previous steps
if (prev.length > 0 && prev.every((s) => s.status === "done")) {
setCompletedRounds((rounds) => [...rounds, prev])
return [{
tool: parsed.tool,
displayName: parsed.displayName ?? parsed.tool,
status: "running",
}]
}
return [
...prev,
{
tool: parsed.tool,
displayName: parsed.displayName ?? parsed.tool,
status: "running",
},
]
})
continue
}
if (parsed.type === "tool_result") {
setActiveSteps((prev) =>
prev.map((step) =>
step.tool === parsed.tool && step.status === "running"
? { ...step, status: "done", summary: parsed.summary }
: step,
),
)
continue
}
// Handle text — both the new {type: "text", text} and legacy {text} formats carry .text
const textContent = parsed.text
if (textContent) {
setMessages((prev) => {
const updated = [...prev]
const last = updated[updated.length - 1]
if (last?.role === "assistant") {
updated[updated.length - 1] = {
...last,
content: last.content + textContent,
}
}
return updated
@@ -129,6 +178,13 @@ export const TimelineChat = memo(function TimelineChat({
return updated
})
} finally {
// Commit any remaining active steps as a final completed round
setActiveSteps((prev) => {
if (prev.length > 0) {
setCompletedRounds((rounds) => [...rounds, prev])
}
return []
})
setIsStreaming(false)
abortRef.current = null
}
@@ -208,6 +264,13 @@ export const TimelineChat = memo(function TimelineChat({
{messages.map((msg, i) => (
<ChatBubble key={i} message={msg} isStreaming={isStreaming && i === messages.length - 1} />
))}
{completedRounds.map((steps, i) => (
<ToolRoundBubble key={`round-${i}`} steps={steps} />
))}
{activeSteps.length > 0 && <ToolRoundBubble steps={activeSteps} isActive />}
<div ref={messagesEndRef} />
</div>
@@ -253,6 +316,56 @@ export const TimelineChat = memo(function TimelineChat({
)
})
const ToolRoundBubble = memo(function ToolRoundBubble({
steps,
isActive = false,
}: {
steps: ToolStep[]
isActive?: boolean
}) {
const allDone = steps.every((s) => s.status === "done")
return (
<div className="flex gap-3">
<div
className={`flex-shrink-0 w-7 h-7 rounded-full flex items-center justify-center ${
isActive && !allDone
? "bg-amber-100 dark:bg-amber-900/40"
: "bg-zinc-100 dark:bg-zinc-800"
}`}
>
{isActive && !allDone ? (
<Loader2 className="h-3.5 w-3.5 animate-spin text-amber-600 dark:text-amber-400" />
) : (
<Check className="h-3.5 w-3.5 text-green-500" />
)}
</div>
<div className="max-w-[85%] rounded-lg px-3 py-2 bg-zinc-50 dark:bg-zinc-800/50 border border-zinc-200 dark:border-zinc-700">
<div className="flex flex-col gap-1">
{steps.map((step, i) => (
<div
key={`${step.tool}-${i}`}
className="flex items-center gap-2 text-xs text-zinc-500 dark:text-zinc-400"
>
{step.status === "running" ? (
<Loader2 className="h-3 w-3 animate-spin flex-shrink-0 text-amber-500" />
) : (
<Check className="h-3 w-3 text-green-500 flex-shrink-0" />
)}
<span>
{step.displayName}
{step.summary && (
<span className="text-zinc-400 dark:text-zinc-500"> {step.summary}</span>
)}
</span>
</div>
))}
</div>
</div>
</div>
)
})
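// Check and Loader2 are icon components, presumably imported at the top of
// this file (the import block isn't shown in this diff), e.g.:
//   import { Check, Loader2 } from "lucide-react"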
const ChatBubble = memo(function ChatBubble({
message,
isStreaming,


@@ -168,75 +168,56 @@ function truncate(s: string, max: number): string {
export function buildSummaryPrompt(data: IndexedTraceData): string {
const lines: string[] = []
// === SECTION 1: ROLE + CRITICAL BEHAVIOR (primacy — first thing the model sees) ===
lines.push(
"You are an assistant helping a developer understand an optimization trace from Codeflash. " +
"You have tools to fetch specific data from the trace on demand. Use them to answer the user's " +
"questions — only fetch what you need. Be concise and reference specific candidates when relevant.\n\n" +
"IMPORTANT: The user may paste errors, warnings, logs, or other output from their CLI runs that " +
"are NOT captured in the trace data you have access to. The trace data is incomplete — it does not " +
"record everything that happens during an optimization run. When the user shares information, trust " +
"it as ground truth even if the trace data doesn't confirm it. Never say 'the trace shows no errors' " +
"to contradict what the user is reporting. Instead, use the trace data you DO have to help explain " +
"and investigate what they're seeing.\n\n" +
"You also have codebase browsing tools (read_file, search_code, list_directory) that let you " +
"navigate the codeflash-internal and codeflash CLI source code, and a get_llm_call_detail tool " +
"to inspect the full prompts and responses of any LLM call in this trace. Use these to trace " +
"problems end-to-end: see what went wrong in the trace, read the actual prompts sent, then " +
"navigate to the pipeline code to suggest a concrete fix.",
"You are an investigation agent for Codeflash optimization traces. Your job is to diagnose " +
"problems, trace them to root causes in the code, and recommend specific fixes.\n\n" +
"CRITICAL RULES:\n" +
"1. You MUST use tools before answering any diagnostic question. Never speculate when you can look.\n" +
"2. Use tools liberally. It is always better to over-investigate than to give a shallow answer.\n" +
"3. When multiple tool calls are independent, call them in parallel to save time.\n" +
"4. Trust user-provided information (CLI output, errors, logs) as ground truth, even when trace " +
"data doesn't confirm it. The trace is incomplete — it does not capture everything.\n" +
"5. Your output should be concise and reference specific candidates, file paths, and line numbers.",
)
// === SECTION 2: TOOL REFERENCE (when to use each) ===
lines.push("")
lines.push("=== DEBUGGING TOOLS — USE THESE PROACTIVELY ===")
lines.push("=== TOOL REFERENCE ===")
lines.push(
"You have tools beyond trace data. Your job is not just to describe what happened — it's to " +
"investigate WHY it happened and point to the specific code or prompt that needs to change. " +
"Always go one level deeper than the surface-level observation.\n\n" +
"IMPORTANT: When you identify a problem (bad tests, failed optimizations, parsing errors, etc.), " +
"you MUST use get_llm_call_detail to inspect the actual prompts and responses involved. Then, if " +
"the issue traces back to a prompt or pipeline bug, use the codebase browsing tools to find the " +
"source code and suggest a concrete fix. Do not stop at 'the tests used mocks' — find out what " +
"prompt instructions led to that and where to fix them.\n\n" +
"=== get_llm_call_detail(call_id) ===\n" +
"Fetches the full system prompt, user prompt, raw LLM response, and parsing results for any " +
"LLM call in this trace. You SHOULD use this:\n" +
"- When analyzing test quality: inspect the testgen prompt to see what instructions the model " +
"received. Did the prompt forbid mocks? Did it provide enough context about the classes?\n" +
"- When investigating bad optimizations: read the optimizer prompt to check if context was " +
"missing or if instructions were unclear\n" +
"- When debugging parsing failures: compare raw_response vs parsed_response to find extraction bugs\n" +
"- When understanding ranking decisions: read the ranker prompt and response\n\n" +
"=== read_file, search_code, list_directory ===\n" +
"Browse the codeflash-internal and codeflash (CLI) source repos. You SHOULD use these:\n" +
"- After inspecting an LLM call, find the prompt template to suggest a specific fix\n" +
"- To understand how a pipeline stage works (postprocessing, deduplication, instrumentation)\n" +
"- To trace a code path from an LLM call back to the pipeline logic that invoked it\n" +
"- When the user asks 'where does X happen' or 'why does Y behave this way'\n\n" +
"TRACE DATA TOOLS (for understanding what happened):\n" +
"• get_errors — Start here. Shows all errors from the optimization (test failures, parsing errors). " +
"Use this first to understand what went wrong before diving deeper.\n" +
"• get_original_code — Returns the original function source. Use when you need to understand what " +
"was being optimized.\n" +
"• get_candidate_code — Returns a specific optimization candidate with its explanation, model, and " +
"rank. Use when investigating why a candidate was ranked low or failed.\n" +
"• get_test_code — Returns generated/instrumented test code. Use when investigating test quality " +
"issues (mocks, bad assertions, missing coverage).\n" +
"• get_ranking_details — Returns the full ranking explanation. Use when investigating why candidates " +
"were ranked the way they were.\n\n" +
"DEBUGGING TOOLS (for understanding WHY it happened):\n" +
"• get_llm_call_detail — YOUR PRIMARY DEBUGGING TOOL. Fetches the full system prompt, user prompt, " +
"raw response, and parsing results for any LLM call. Use this FIRST when investigating any " +
"LLM-related issue: bad tests, bad optimizations, parsing failures, ranking decisions. This tool " +
"tells you exactly what instructions the model received and what it produced.\n\n" +
"CODEBASE TOOLS (for finding where to fix it):\n" +
"• search_code — Ripgrep search across codeflash-internal or codeflash CLI repos. Use AFTER " +
"inspecting an LLM call to find the prompt template or pipeline code responsible. Search for " +
"distinctive phrases from the prompt to locate the template file.\n" +
"• read_file — Read a file with line numbers. Use after search_code to read full context around " +
"a match. Also use to read prompt templates (.md files) and pipeline modules.\n" +
"• list_directory — List files/dirs in a repo path. Use to orient yourself when navigating " +
"unfamiliar parts of the codebase.\n\n" +
"Key paths in codeflash-internal:\n" +
"- django/aiservice/core/shared/ — optimizer_router, testgen_router, ranker\n" +
"- django/aiservice/core/languages/python/optimizer/ — Python optimizer pipeline\n" +
"- django/aiservice/core/languages/python/testgen/ — test generation pipeline\n" +
"- django/aiservice/aiservice/llm.py — LLM provider abstraction\n" +
"- Prompt templates are .md files alongside their modules (rendered with Jinja2)\n\n" +
"=== EXPECTED WORKFLOW — YOU MUST COMPLETE ALL STEPS ===\n" +
"When you find a problem in a trace, DO NOT stop at describing the symptoms. You MUST complete " +
"the full investigation:\n\n" +
"1. OBSERVE: Answer the user's question using trace data tools (get_test_code, get_candidate_code, etc.)\n" +
"2. INVESTIGATE: Use get_llm_call_detail to read the prompts and responses that caused the problem. " +
"Identify whether the issue is a prompt gap, a model failure to follow instructions, or a pipeline bug.\n" +
"3. LOCATE: Use search_code to find the prompt template or pipeline code responsible. Read it with " +
"read_file. Prompt templates are .md files — search for distinctive phrases from the prompt you found " +
"in step 2 to locate the template file.\n" +
"4. RECOMMEND: Suggest a concrete fix — name the file, quote the relevant section, and describe " +
"what to change. For example: 'In django/aiservice/core/languages/python/testgen/prompt.md, the " +
"no-mocks instruction at line 45 should be moved to the system prompt for stronger enforcement.'\n\n" +
"If you skip steps 3-4, your response is INCOMPLETE. The user is a developer who wants actionable " +
"fixes, not just observations about what went wrong.\n\n" +
"HARD REQUIREMENT: When you identify a problem caused by a prompt or pipeline stage, your response " +
"MUST include at least one real file path from the codebase that you found via search_code or " +
"read_file. Generic advice like 'strengthen the prompt' is not enough — find the actual file, " +
"read it, and reference the specific lines that need to change.",
"- Prompt templates are .md files alongside their modules (rendered with Jinja2)",
)
// === SECTION 3: DOMAIN KNOWLEDGE (middle) ===
lines.push("")
lines.push("=== CODEFLASH TESTING GUIDELINES ===")
lines.push(
@@ -253,6 +234,7 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
"the cause before blaming the optimizations.",
)
// === SECTION 4: TRACE OVERVIEW (the data itself) ===
lines.push("")
lines.push("=== TRACE OVERVIEW ===")
@@ -314,17 +296,33 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
}
}
// === SECTION 5: WORKFLOW + CHECKLIST (recency — last thing the model sees) ===
lines.push("")
lines.push("=== RESPONSE CHECKLIST (review before responding) ===")
lines.push("=== INVESTIGATION WORKFLOW ===")
lines.push(
"Before you send your response, verify:\n" +
"[ ] If you identified a problem (bad tests, failed optimization, parsing error, etc.), did you " +
"use get_llm_call_detail to read the actual prompt/response that caused it?\n" +
"[ ] If the root cause is in a prompt or pipeline, did you use search_code and read_file to " +
"find the actual source file? Your response MUST include at least one real file path from the " +
"codebase (e.g., 'django/aiservice/core/languages/python/testgen/system_prompt.md').\n" +
"[ ] Are your recommendations grounded in specific code you read, not generic advice?\n\n" +
"If any box is unchecked, go back and use the tools before responding.",
"When the user asks about a problem, follow these steps IN ORDER. Do not skip steps.\n\n" +
"1. OBSERVE: Use get_errors and trace data tools to understand what happened.\n" +
"2. INVESTIGATE: Use get_llm_call_detail to read the actual prompts and responses that caused the " +
"problem. Identify whether the issue is a prompt gap, a model failure, or a pipeline bug.\n" +
"3. LOCATE: Use search_code to find the prompt template or pipeline code responsible. Read it with " +
"read_file. Prompt templates are .md files — search for distinctive phrases from the prompt you " +
"found in step 2 to locate the template file.\n" +
"4. RECOMMEND: Name the file, quote the relevant section, describe what to change.\n\n" +
"EXAMPLE of a good investigation:\n" +
" User: 'Why did the tests use mocks?'\n" +
" → get_test_code(index=1) → sees Mock objects in generated tests\n" +
" → get_llm_call_detail(call_id='...testgen...') → reads the system prompt, finds no anti-mock instruction\n" +
" → search_code(repo='codeflash-internal', pattern='mock.*MagicMock', glob='*.md') → finds prompt template\n" +
" → read_file(repo='codeflash-internal', path='django/aiservice/.../testgen/system_prompt.md') → reads it\n" +
" → Response: 'The testgen system prompt at django/.../testgen/system_prompt.md:42 does not " +
"explicitly forbid mocks. Add an instruction after line 42: \"Never use Mock, MagicMock, patch, " +
"or SimpleNamespace. Always construct real instances.\"'\n\n" +
"BEFORE YOU RESPOND, verify:\n" +
"- Did you use get_llm_call_detail to read the actual prompt/response?\n" +
"- Did you use search_code/read_file to find the source file?\n" +
"- Does your response include at least one real file path with line numbers?\n" +
"- Are your recommendations grounded in code you actually read?\n\n" +
"If any answer is no, GO BACK AND USE THE TOOLS. Do not respond with generic advice.",
)
return lines.join("\n")
@@ -334,14 +332,17 @@ export const anthropicToolDefinitions = [
{
name: "get_original_code",
description:
"Returns the original source code of the function being optimized.",
"Returns the original source code of the function being optimized. Use this when you need " +
"to understand what was being optimized, compare with candidate code, or check if the " +
"original function has patterns that explain optimization failures.",
input_schema: { type: "object" as const, properties: {} },
},
{
name: "get_candidate_code",
description:
"Returns the optimized code, explanation, and metadata for a specific candidate. " +
"Use source_type and index to identify the candidate (e.g., source_type='OPTIMIZE', index=1 for the first optimization candidate).",
"Returns the optimized code, explanation, model, and rank for a specific candidate. " +
"Use this when investigating why a candidate was ranked low, failed tests, or produced " +
"incorrect output. Call this for multiple candidates to compare approaches.",
input_schema: {
type: "object" as const,
properties: {
@@ -361,8 +362,10 @@ export const anthropicToolDefinitions = [
{
name: "get_test_code",
description:
"Returns the generated test code for a specific test group, including which model generated it. " +
"Each test group may have generated, instrumented, and performance-instrumented variants.",
"Returns the generated test code for a specific test group with model info. " +
"Use this when investigating test quality issues — look for Mock, MagicMock, patch, " +
"SimpleNamespace, or other fake objects. Also use when candidates fail tests to check " +
"whether the tests themselves are the problem.",
input_schema: {
type: "object" as const,
properties: {
@@ -377,20 +380,28 @@ export const anthropicToolDefinitions = [
{
name: "get_ranking_details",
description:
"Returns the full ranking explanation and the ordered list of candidates.",
"Returns the full ranking explanation and ordered candidate list. Use this when " +
"investigating why certain candidates were preferred, why good-looking code was ranked low, " +
"or why the ranking disagrees with expected results.",
input_schema: { type: "object" as const, properties: {} },
},
{
name: "get_errors",
description:
"Returns all errors encountered during the optimization, including test failures and their context.",
"Returns ALL errors from the optimization: test failures, parsing errors, pipeline errors, " +
"and their context. This should be one of your FIRST calls when investigating any failure — " +
"it tells you what went wrong before you dive into why.",
input_schema: { type: "object" as const, properties: {} },
},
{
name: "get_llm_call_detail",
description:
"Returns the full prompt, response, and parsing results for a specific LLM call. " +
"Use the call_id from the LLM call IDs listed in the trace overview.",
"YOUR PRIMARY DEBUGGING TOOL. Fetches the full system prompt, user prompt, raw LLM " +
"response, parsed response, and parsing results for any LLM call in this trace. " +
"Use this FIRST when investigating any LLM-related issue: bad tests, bad optimizations, " +
"parsing failures, ranking decisions. This tells you exactly what instructions the model " +
"received and what it produced. After reading the prompt, use search_code to find the " +
"template file and suggest a fix.",
input_schema: {
type: "object" as const,
properties: {
@@ -405,8 +416,10 @@ export const anthropicToolDefinitions = [
{
name: "read_file",
description:
"Read a file from the codeflash-internal or codeflash CLI repository. " +
"Returns file content with line numbers. Use start_line/end_line for large files.",
"Read a file from the codeflash-internal or codeflash CLI repository with line numbers. " +
"Use this AFTER search_code to read full context around a match. Essential for reading " +
"prompt templates (.md files) and pipeline code. Your investigation is incomplete until " +
"you've read the relevant source file.",
input_schema: {
type: "object" as const,
properties: {
@@ -434,7 +447,10 @@ export const anthropicToolDefinitions = [
{
name: "search_code",
description:
"Search for a pattern across a repository using ripgrep. Returns matching lines with file paths and line numbers.",
"Ripgrep search across a repository. Returns matching lines with file paths and line " +
"numbers. Use this AFTER get_llm_call_detail to find the prompt template or pipeline " +
"code responsible for a problem. Search for distinctive phrases from the prompt to locate " +
"the template file. Also use to find where specific pipeline stages are implemented.",
input_schema: {
type: "object" as const,
properties: {
@@ -445,11 +461,11 @@ export const anthropicToolDefinitions = [
},
pattern: {
type: "string",
description: "Regex pattern to search for (ripgrep syntax).",
description: "Regex pattern to search for (ripgrep syntax). Be specific — use distinctive phrases.",
},
glob: {
type: "string",
description: "File glob filter (e.g., '*.py', '*.ts').",
description: "File glob filter (e.g., '*.py', '*.md', '*.ts'). Use '*.md' for prompt templates.",
},
max_results: {
type: "number",
@@ -462,7 +478,9 @@ export const anthropicToolDefinitions = [
{
name: "list_directory",
description:
"List files and directories in a repository path. Directories are listed first, sorted alphabetically.",
"List files and directories in a repository path. Use this to orient yourself when " +
"navigating unfamiliar parts of the codebase, or to discover prompt template files " +
"alongside pipeline modules.",
input_schema: {
type: "object" as const,
properties: {