feat: add tool activity display and fix streaming timeout in observability chat

Restructure the agent loop to use stream() + finalMessage() for all API calls,
fixing the SDK's non-streaming timeout error at max_tokens=32k. Add parallel
tool execution and tool activity bubbles in the frontend, and restructure the
system prompt for better investigation behavior.
Kevin Turcios 2026-02-15 01:29:45 -05:00
parent 51372ca0ad
commit b09262ccbc
3 changed files with 332 additions and 132 deletions
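
For reference, the timeout fix reduces to one pattern change. A minimal standalone sketch of that pattern (TypeScript; the function name and prompt string are placeholders, not from this commit):

import Anthropic from "@anthropic-ai/sdk"

// With max_tokens this large, the SDK estimates the request may run >10 minutes
// and rejects a plain client.messages.create() unless streaming is used.
// stream() keeps the HTTP response streaming, while finalMessage() awaits the
// fully assembled Anthropic.Message, so call sites keep a request/response shape.
async function callOnce(client: Anthropic): Promise<Anthropic.Message> {
  const stream = client.messages.stream({
    model: "claude-opus-4-6", // model id as used in the diff below
    max_tokens: 32_000,
    messages: [{ role: "user", content: "placeholder prompt" }],
  })
  // No .on("text") handler is attached, so nothing is forwarded mid-stream.
  return stream.finalMessage()
}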


@@ -17,6 +17,18 @@ interface ChatMessage {
const MAX_TOOL_ROUNDS = 15
const KEEPALIVE_INTERVAL_MS = 15_000
const TOOL_DISPLAY_NAMES: Record<string, string> = {
get_original_code: "Reading original code",
get_candidate_code: "Reading candidate code",
get_test_code: "Reading test code",
get_ranking_details: "Reading ranking details",
get_errors: "Checking errors",
get_llm_call_detail: "Inspecting LLM call",
read_file: "Reading file",
search_code: "Searching codebase",
list_directory: "Listing directory",
}
function getClient(): Anthropic {
const baseURL = process.env.ANTHROPIC_FOUNDRY_BASE_URL
const apiKey = process.env.AZURE_OPENAI_API_KEY
@@ -26,26 +38,99 @@ function getClient(): Anthropic {
return new Anthropic({ baseURL, apiKey })
}
function summarizeToolResult(toolName: string, result: string): string {
const lines = result.split("\n").filter(Boolean)
switch (toolName) {
case "search_code": {
if (result === "No matches found.") return "No matches found"
return `Found ${lines.length} matching lines`
}
case "read_file": {
const header = lines[0] ?? ""
return header.startsWith("File:") ? header : `Read ${lines.length} lines`
}
case "get_errors": {
if (result === "No errors in this trace.") return "No errors"
const count = lines.filter((l) => l.startsWith("[")).length
return `Found ${count} errors`
}
case "get_llm_call_detail":
return "Loaded full LLM call details"
case "list_directory": {
const count = lines.length - 1 // subtract header line
return `Listed ${count} entries`
}
default:
return "Done"
}
}
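// Example (hypothetical inputs):
//   summarizeToolResult("search_code", "src/a.ts:4: foo\nsrc/b.ts:9: foo") → "Found 2 matching lines"
//   summarizeToolResult("get_errors", "No errors in this trace.") → "No errors"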
async function processToolCalls(
content: Anthropic.ContentBlock[],
indexed: IndexedTraceData,
enqueue: (data: string) => void,
): Promise<Anthropic.ToolResultBlockParam[]> {
const toolUseBlocks = content.filter(
(block): block is Anthropic.ToolUseBlock => block.type === "tool_use",
)
if (toolUseBlocks.length === 0) return []
// Emit tool_start events for all tools
for (const block of toolUseBlocks) {
enqueue(
`data: ${JSON.stringify({
type: "tool_start",
tool: block.name,
displayName: TOOL_DISPLAY_NAMES[block.name] ?? block.name,
})}\n\n`,
)
}
// Execute all tool calls in parallel
const results = await Promise.all(
toolUseBlocks.map(async (block) => {
const result = await resolveToolCall(
block.name,
(block.input as Record<string, unknown>) ?? {},
indexed,
)
enqueue(
`data: ${JSON.stringify({
type: "tool_result",
tool: block.name,
displayName: TOOL_DISPLAY_NAMES[block.name] ?? block.name,
summary: summarizeToolResult(block.name, result),
})}\n\n`,
)
return {
type: "tool_result" as const,
tool_use_id: block.id,
content: result,
}
}),
)
return results
}
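// Note: Promise.all preserves input order, so the returned tool_result blocks
// line up with their tool_use blocks even though execution is parallel. If any
// resolveToolCall rejects, Promise.all rejects as a whole, presumably surfacing
// in the route handler's try/catch rather than as a per-tool error.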
// Shared base params for both streaming and non-streaming calls.
// Adaptive thinking lets Claude decide how much reasoning is needed per request.
// On Opus 4.6, this automatically enables interleaved thinking (thinking between
// tool calls) without needing a beta header.
function baseParams(
systemPrompt: string,
conversationMessages: Anthropic.MessageParam[],
) {
return {
model: "claude-opus-4-6" as const,
max_tokens: 32_000,
system: systemPrompt,
messages: conversationMessages,
tools: anthropicToolDefinitions as Anthropic.Tool[],
thinking: { type: "adaptive" as const },
}
}
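// Both call sites below follow the same shape:
//   const stream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
//   const message = await stream.finalMessage()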
export async function POST(request: NextRequest): Promise<Response> {
@@ -55,7 +140,7 @@ export async function POST(request: NextRequest): Promise<Response> {
} catch (err) {
return Response.json(
{ error: err instanceof Error ? err.message : "Configuration error" },
{ status: 500 },
)
}
@@ -70,7 +155,7 @@ export async function POST(request: NextRequest): Promise<Response> {
if (!traceId || !messages?.length) {
return Response.json(
{ error: "traceId and messages are required" },
{ status: 400 },
)
}
@@ -82,7 +167,6 @@ export async function POST(request: NextRequest): Promise<Response> {
const indexed = indexTraceData(traceData)
const systemPrompt = buildSummaryPrompt(indexed)
const conversationMessages: Anthropic.MessageParam[] = messages.map((m) => ({
role: m.role,
content: m.content,
@@ -96,59 +180,44 @@ export async function POST(request: NextRequest): Promise<Response> {
const keepalive = setInterval(() => enqueue(": keepalive\n\n"), KEEPALIVE_INTERVAL_MS)
try {
// Tool resolution loop — uses stream() + finalMessage() to avoid the SDK's
// non-streaming timeout limit (max_tokens 32k estimates >10min, which blocks
// client.messages.create()). No text handler is attached, so no text reaches
// the frontend — avoiding "stutter" from intermediate fragments.
let toolRounds = 0
while (toolRounds < MAX_TOOL_ROUNDS) {
const toolStream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
const response = await toolStream.finalMessage()
if (response.stop_reason !== "tool_use") break
conversationMessages.push({ role: "assistant", content: response.content })
const toolResults = await processToolCalls(response.content, indexed, enqueue)
conversationMessages.push({ role: "user", content: toolResults })
toolRounds++
}
// Stream the final response — this is the only text the user sees
const messageStream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
messageStream.on("text", (textDelta) => {
enqueue(`data: ${JSON.stringify({ type: "text", text: textDelta })}\n\n`)
})
const finalMessage = await messageStream.finalMessage()
// Edge case: final streaming response ended with tool_use. Process the tool
// calls and make one more streaming call to get a text response.
if (finalMessage.stop_reason === "tool_use") {
conversationMessages.push({ role: "assistant", content: finalMessage.content })
const toolResults = await processToolCalls(finalMessage.content, indexed, enqueue)
conversationMessages.push({ role: "user", content: toolResults })
const followUp = client.messages.stream(baseParams(systemPrompt, conversationMessages))
followUp.on("text", (textDelta) => {
enqueue(`data: ${JSON.stringify({ type: "text", text: textDelta })}\n\n`)
})
await followUp.finalMessage()
}
enqueue("data: [DONE]\n\n")


@@ -11,6 +11,13 @@ interface ChatMessage {
content: string
}
interface ToolStep {
tool: string
displayName: string
status: "running" | "done"
summary?: string
}
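// Lifecycle: a "tool_start" event appends a step with status "running"; the
// matching "tool_result" event flips it to "done" and attaches the summary.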
interface TimelineChatProps {
traceId: string
isOpen: boolean
@@ -25,6 +32,8 @@ export const TimelineChat = memo(function TimelineChat({
const [messages, setMessages] = useState<ChatMessage[]>([])
const [input, setInput] = useState("")
const [isStreaming, setIsStreaming] = useState(false)
const [completedRounds, setCompletedRounds] = useState<ToolStep[][]>([])
const [activeSteps, setActiveSteps] = useState<ToolStep[]>([])
const messagesEndRef = useRef<HTMLDivElement>(null)
const inputRef = useRef<HTMLTextAreaElement>(null)
const abortRef = useRef<AbortController | null>(null)
@@ -35,7 +44,7 @@ export const TimelineChat = memo(function TimelineChat({
useEffect(() => {
scrollToBottom()
}, [messages, completedRounds, activeSteps, scrollToBottom])
useEffect(() => {
if (isOpen) {
@@ -57,6 +66,8 @@ export const TimelineChat = memo(function TimelineChat({
abortRef.current = controller
setMessages((prev) => [...prev, { role: "assistant", content: "" }])
setCompletedRounds([])
setActiveSteps([])
try {
const res = await fetch("/api/observability/chat", {
@@ -92,14 +103,52 @@ export const TimelineChat = memo(function TimelineChat({
try {
const parsed = JSON.parse(data)
// Handle typed events (new protocol)
if (parsed.type === "tool_start") {
setActiveSteps((prev) => {
// If all previous steps are done, this is a new round — commit previous steps
if (prev.length > 0 && prev.every((s) => s.status === "done")) {
setCompletedRounds((rounds) => [...rounds, prev])
return [{
tool: parsed.tool,
displayName: parsed.displayName ?? parsed.tool,
status: "running",
}]
}
return [
...prev,
{
tool: parsed.tool,
displayName: parsed.displayName ?? parsed.tool,
status: "running",
},
]
})
continue
}
if (parsed.type === "tool_result") {
setActiveSteps((prev) =>
prev.map((step) =>
step.tool === parsed.tool && step.status === "running"
? { ...step, status: "done", summary: parsed.summary }
: step,
),
)
continue
}
// Handle text — both the new {type: "text", text} and legacy {text} formats carry .text
const textContent = parsed.text
if (textContent) {
setMessages((prev) => {
const updated = [...prev]
const last = updated[updated.length - 1]
if (last?.role === "assistant") {
updated[updated.length - 1] = {
...last,
content: last.content + textContent,
}
}
return updated
@@ -129,6 +178,13 @@ export const TimelineChat = memo(function TimelineChat({
return updated
})
} finally {
// Commit any remaining active steps as a final completed round
setActiveSteps((prev) => {
if (prev.length > 0) {
setCompletedRounds((rounds) => [...rounds, prev])
}
return []
})
setIsStreaming(false)
abortRef.current = null
}
@@ -208,6 +264,13 @@ export const TimelineChat = memo(function TimelineChat({
{messages.map((msg, i) => (
<ChatBubble key={i} message={msg} isStreaming={isStreaming && i === messages.length - 1} />
))}
{completedRounds.map((steps, i) => (
<ToolRoundBubble key={`round-${i}`} steps={steps} />
))}
{activeSteps.length > 0 && <ToolRoundBubble steps={activeSteps} isActive />}
<div ref={messagesEndRef} />
</div>
@@ -253,6 +316,56 @@ export const TimelineChat = memo(function TimelineChat({
)
})
const ToolRoundBubble = memo(function ToolRoundBubble({
steps,
isActive = false,
}: {
steps: ToolStep[]
isActive?: boolean
}) {
const allDone = steps.every((s) => s.status === "done")
return (
<div className="flex gap-3">
<div
className={`flex-shrink-0 w-7 h-7 rounded-full flex items-center justify-center ${
isActive && !allDone
? "bg-amber-100 dark:bg-amber-900/40"
: "bg-zinc-100 dark:bg-zinc-800"
}`}
>
{isActive && !allDone ? (
<Loader2 className="h-3.5 w-3.5 animate-spin text-amber-600 dark:text-amber-400" />
) : (
<Check className="h-3.5 w-3.5 text-green-500" />
)}
</div>
<div className="max-w-[85%] rounded-lg px-3 py-2 bg-zinc-50 dark:bg-zinc-800/50 border border-zinc-200 dark:border-zinc-700">
<div className="flex flex-col gap-1">
{steps.map((step, i) => (
<div
key={`${step.tool}-${i}`}
className="flex items-center gap-2 text-xs text-zinc-500 dark:text-zinc-400"
>
{step.status === "running" ? (
<Loader2 className="h-3 w-3 animate-spin flex-shrink-0 text-amber-500" />
) : (
<Check className="h-3 w-3 text-green-500 flex-shrink-0" />
)}
<span>
{step.displayName}
{step.summary && (
<span className="text-zinc-400 dark:text-zinc-500"> {step.summary}</span>
)}
</span>
</div>
))}
</div>
</div>
</div>
)
})
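// Check and Loader2 are icon components, presumably imported at the top of
// this file (the import block isn't shown in this diff), e.g.:
//   import { Check, Loader2 } from "lucide-react"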
const ChatBubble = memo(function ChatBubble({
message,
isStreaming,


@@ -168,75 +168,56 @@ function truncate(s: string, max: number): string {
export function buildSummaryPrompt(data: IndexedTraceData): string {
const lines: string[] = []
// === SECTION 1: ROLE + CRITICAL BEHAVIOR (primacy — first thing the model sees) ===
lines.push(
"You are an assistant helping a developer understand an optimization trace from Codeflash. " +
"You have tools to fetch specific data from the trace on demand. Use them to answer the user's " +
"questions — only fetch what you need. Be concise and reference specific candidates when relevant.\n\n" +
"IMPORTANT: The user may paste errors, warnings, logs, or other output from their CLI runs that " +
"are NOT captured in the trace data you have access to. The trace data is incomplete — it does not " +
"record everything that happens during an optimization run. When the user shares information, trust " +
"it as ground truth even if the trace data doesn't confirm it. Never say 'the trace shows no errors' " +
"to contradict what the user is reporting. Instead, use the trace data you DO have to help explain " +
"and investigate what they're seeing.\n\n" +
"You also have codebase browsing tools (read_file, search_code, list_directory) that let you " +
"navigate the codeflash-internal and codeflash CLI source code, and a get_llm_call_detail tool " +
"to inspect the full prompts and responses of any LLM call in this trace. Use these to trace " +
"problems end-to-end: see what went wrong in the trace, read the actual prompts sent, then " +
"navigate to the pipeline code to suggest a concrete fix.",
"You are an investigation agent for Codeflash optimization traces. Your job is to diagnose " +
"problems, trace them to root causes in the code, and recommend specific fixes.\n\n" +
"CRITICAL RULES:\n" +
"1. You MUST use tools before answering any diagnostic question. Never speculate when you can look.\n" +
"2. Use tools liberally. It is always better to over-investigate than to give a shallow answer.\n" +
"3. When multiple tool calls are independent, call them in parallel to save time.\n" +
"4. Trust user-provided information (CLI output, errors, logs) as ground truth, even when trace " +
"data doesn't confirm it. The trace is incomplete — it does not capture everything.\n" +
"5. Your output should be concise and reference specific candidates, file paths, and line numbers.",
)
// === SECTION 2: TOOL REFERENCE (when to use each) ===
lines.push("")
lines.push("=== DEBUGGING TOOLS — USE THESE PROACTIVELY ===")
lines.push("=== TOOL REFERENCE ===")
lines.push(
"You have tools beyond trace data. Your job is not just to describe what happened — it's to " +
"investigate WHY it happened and point to the specific code or prompt that needs to change. " +
"Always go one level deeper than the surface-level observation.\n\n" +
"IMPORTANT: When you identify a problem (bad tests, failed optimizations, parsing errors, etc.), " +
"you MUST use get_llm_call_detail to inspect the actual prompts and responses involved. Then, if " +
"the issue traces back to a prompt or pipeline bug, use the codebase browsing tools to find the " +
"source code and suggest a concrete fix. Do not stop at 'the tests used mocks' — find out what " +
"prompt instructions led to that and where to fix them.\n\n" +
"=== get_llm_call_detail(call_id) ===\n" +
"Fetches the full system prompt, user prompt, raw LLM response, and parsing results for any " +
"LLM call in this trace. You SHOULD use this:\n" +
"- When analyzing test quality: inspect the testgen prompt to see what instructions the model " +
"received. Did the prompt forbid mocks? Did it provide enough context about the classes?\n" +
"- When investigating bad optimizations: read the optimizer prompt to check if context was " +
"missing or if instructions were unclear\n" +
"- When debugging parsing failures: compare raw_response vs parsed_response to find extraction bugs\n" +
"- When understanding ranking decisions: read the ranker prompt and response\n\n" +
"=== read_file, search_code, list_directory ===\n" +
"Browse the codeflash-internal and codeflash (CLI) source repos. You SHOULD use these:\n" +
"- After inspecting an LLM call, find the prompt template to suggest a specific fix\n" +
"- To understand how a pipeline stage works (postprocessing, deduplication, instrumentation)\n" +
"- To trace a code path from an LLM call back to the pipeline logic that invoked it\n" +
"- When the user asks 'where does X happen' or 'why does Y behave this way'\n\n" +
"TRACE DATA TOOLS (for understanding what happened):\n" +
"• get_errors — Start here. Shows all errors from the optimization (test failures, parsing errors). " +
"Use this first to understand what went wrong before diving deeper.\n" +
"• get_original_code — Returns the original function source. Use when you need to understand what " +
"was being optimized.\n" +
"• get_candidate_code — Returns a specific optimization candidate with its explanation, model, and " +
"rank. Use when investigating why a candidate was ranked low or failed.\n" +
"• get_test_code — Returns generated/instrumented test code. Use when investigating test quality " +
"issues (mocks, bad assertions, missing coverage).\n" +
"• get_ranking_details — Returns the full ranking explanation. Use when investigating why candidates " +
"were ranked the way they were.\n\n" +
"DEBUGGING TOOLS (for understanding WHY it happened):\n" +
"• get_llm_call_detail — YOUR PRIMARY DEBUGGING TOOL. Fetches the full system prompt, user prompt, " +
"raw response, and parsing results for any LLM call. Use this FIRST when investigating any " +
"LLM-related issue: bad tests, bad optimizations, parsing failures, ranking decisions. This tool " +
"tells you exactly what instructions the model received and what it produced.\n\n" +
"CODEBASE TOOLS (for finding where to fix it):\n" +
"• search_code — Ripgrep search across codeflash-internal or codeflash CLI repos. Use AFTER " +
"inspecting an LLM call to find the prompt template or pipeline code responsible. Search for " +
"distinctive phrases from the prompt to locate the template file.\n" +
"• read_file — Read a file with line numbers. Use after search_code to read full context around " +
"a match. Also use to read prompt templates (.md files) and pipeline modules.\n" +
"• list_directory — List files/dirs in a repo path. Use to orient yourself when navigating " +
"unfamiliar parts of the codebase.\n\n" +
"Key paths in codeflash-internal:\n" +
"- django/aiservice/core/shared/ — optimizer_router, testgen_router, ranker\n" +
"- django/aiservice/core/languages/python/optimizer/ — Python optimizer pipeline\n" +
"- django/aiservice/core/languages/python/testgen/ — test generation pipeline\n" +
"- django/aiservice/aiservice/llm.py — LLM provider abstraction\n" +
"- Prompt templates are .md files alongside their modules (rendered with Jinja2)\n\n" +
"=== EXPECTED WORKFLOW — YOU MUST COMPLETE ALL STEPS ===\n" +
"When you find a problem in a trace, DO NOT stop at describing the symptoms. You MUST complete " +
"the full investigation:\n\n" +
"1. OBSERVE: Answer the user's question using trace data tools (get_test_code, get_candidate_code, etc.)\n" +
"2. INVESTIGATE: Use get_llm_call_detail to read the prompts and responses that caused the problem. " +
"Identify whether the issue is a prompt gap, a model failure to follow instructions, or a pipeline bug.\n" +
"3. LOCATE: Use search_code to find the prompt template or pipeline code responsible. Read it with " +
"read_file. Prompt templates are .md files — search for distinctive phrases from the prompt you found " +
"in step 2 to locate the template file.\n" +
"4. RECOMMEND: Suggest a concrete fix — name the file, quote the relevant section, and describe " +
"what to change. For example: 'In django/aiservice/core/languages/python/testgen/prompt.md, the " +
"no-mocks instruction at line 45 should be moved to the system prompt for stronger enforcement.'\n\n" +
"If you skip steps 3-4, your response is INCOMPLETE. The user is a developer who wants actionable " +
"fixes, not just observations about what went wrong.\n\n" +
"HARD REQUIREMENT: When you identify a problem caused by a prompt or pipeline stage, your response " +
"MUST include at least one real file path from the codebase that you found via search_code or " +
"read_file. Generic advice like 'strengthen the prompt' is not enough — find the actual file, " +
"read it, and reference the specific lines that need to change.",
"- Prompt templates are .md files alongside their modules (rendered with Jinja2)",
)
// === SECTION 3: DOMAIN KNOWLEDGE (middle) ===
lines.push("")
lines.push("=== CODEFLASH TESTING GUIDELINES ===")
lines.push(
@@ -253,6 +234,7 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
"the cause before blaming the optimizations.",
)
// === SECTION 4: TRACE OVERVIEW (the data itself) ===
lines.push("")
lines.push("=== TRACE OVERVIEW ===")
@@ -314,17 +296,33 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
}
}
// === SECTION 5: WORKFLOW + CHECKLIST (recency — last thing the model sees) ===
lines.push("")
lines.push("=== RESPONSE CHECKLIST (review before responding) ===")
lines.push("=== INVESTIGATION WORKFLOW ===")
lines.push(
"Before you send your response, verify:\n" +
"[ ] If you identified a problem (bad tests, failed optimization, parsing error, etc.), did you " +
"use get_llm_call_detail to read the actual prompt/response that caused it?\n" +
"[ ] If the root cause is in a prompt or pipeline, did you use search_code and read_file to " +
"find the actual source file? Your response MUST include at least one real file path from the " +
"codebase (e.g., 'django/aiservice/core/languages/python/testgen/system_prompt.md').\n" +
"[ ] Are your recommendations grounded in specific code you read, not generic advice?\n\n" +
"If any box is unchecked, go back and use the tools before responding.",
"When the user asks about a problem, follow these steps IN ORDER. Do not skip steps.\n\n" +
"1. OBSERVE: Use get_errors and trace data tools to understand what happened.\n" +
"2. INVESTIGATE: Use get_llm_call_detail to read the actual prompts and responses that caused the " +
"problem. Identify whether the issue is a prompt gap, a model failure, or a pipeline bug.\n" +
"3. LOCATE: Use search_code to find the prompt template or pipeline code responsible. Read it with " +
"read_file. Prompt templates are .md files — search for distinctive phrases from the prompt you " +
"found in step 2 to locate the template file.\n" +
"4. RECOMMEND: Name the file, quote the relevant section, describe what to change.\n\n" +
"EXAMPLE of a good investigation:\n" +
" User: 'Why did the tests use mocks?'\n" +
" → get_test_code(index=1) → sees Mock objects in generated tests\n" +
" → get_llm_call_detail(call_id='...testgen...') → reads the system prompt, finds no anti-mock instruction\n" +
" → search_code(repo='codeflash-internal', pattern='mock.*MagicMock', glob='*.md') → finds prompt template\n" +
" → read_file(repo='codeflash-internal', path='django/aiservice/.../testgen/system_prompt.md') → reads it\n" +
" → Response: 'The testgen system prompt at django/.../testgen/system_prompt.md:42 does not " +
"explicitly forbid mocks. Add an instruction after line 42: \"Never use Mock, MagicMock, patch, " +
"or SimpleNamespace. Always construct real instances.\"'\n\n" +
"BEFORE YOU RESPOND, verify:\n" +
"- Did you use get_llm_call_detail to read the actual prompt/response?\n" +
"- Did you use search_code/read_file to find the source file?\n" +
"- Does your response include at least one real file path with line numbers?\n" +
"- Are your recommendations grounded in code you actually read?\n\n" +
"If any answer is no, GO BACK AND USE THE TOOLS. Do not respond with generic advice.",
)
return lines.join("\n")
@@ -334,14 +332,17 @@ export const anthropicToolDefinitions = [
{
name: "get_original_code",
description:
"Returns the original source code of the function being optimized.",
"Returns the original source code of the function being optimized. Use this when you need " +
"to understand what was being optimized, compare with candidate code, or check if the " +
"original function has patterns that explain optimization failures.",
input_schema: { type: "object" as const, properties: {} },
},
{
name: "get_candidate_code",
description:
"Returns the optimized code, explanation, and metadata for a specific candidate. " +
"Use source_type and index to identify the candidate (e.g., source_type='OPTIMIZE', index=1 for the first optimization candidate).",
"Returns the optimized code, explanation, model, and rank for a specific candidate. " +
"Use this when investigating why a candidate was ranked low, failed tests, or produced " +
"incorrect output. Call this for multiple candidates to compare approaches.",
input_schema: {
type: "object" as const,
properties: {
@@ -361,8 +362,10 @@ export const anthropicToolDefinitions = [
{
name: "get_test_code",
description:
"Returns the generated test code for a specific test group, including which model generated it. " +
"Each test group may have generated, instrumented, and performance-instrumented variants.",
"Returns the generated test code for a specific test group with model info. " +
"Use this when investigating test quality issues — look for Mock, MagicMock, patch, " +
"SimpleNamespace, or other fake objects. Also use when candidates fail tests to check " +
"whether the tests themselves are the problem.",
input_schema: {
type: "object" as const,
properties: {
@@ -377,20 +380,28 @@ export const anthropicToolDefinitions = [
{
name: "get_ranking_details",
description:
"Returns the full ranking explanation and the ordered list of candidates.",
"Returns the full ranking explanation and ordered candidate list. Use this when " +
"investigating why certain candidates were preferred, why good-looking code was ranked low, " +
"or why the ranking disagrees with expected results.",
input_schema: { type: "object" as const, properties: {} },
},
{
name: "get_errors",
description:
"Returns all errors encountered during the optimization, including test failures and their context.",
"Returns ALL errors from the optimization: test failures, parsing errors, pipeline errors, " +
"and their context. This should be one of your FIRST calls when investigating any failure — " +
"it tells you what went wrong before you dive into why.",
input_schema: { type: "object" as const, properties: {} },
},
{
name: "get_llm_call_detail",
description:
"Returns the full prompt, response, and parsing results for a specific LLM call. " +
"Use the call_id from the LLM call IDs listed in the trace overview.",
"YOUR PRIMARY DEBUGGING TOOL. Fetches the full system prompt, user prompt, raw LLM " +
"response, parsed response, and parsing results for any LLM call in this trace. " +
"Use this FIRST when investigating any LLM-related issue: bad tests, bad optimizations, " +
"parsing failures, ranking decisions. This tells you exactly what instructions the model " +
"received and what it produced. After reading the prompt, use search_code to find the " +
"template file and suggest a fix.",
input_schema: {
type: "object" as const,
properties: {
@@ -405,8 +416,10 @@ export const anthropicToolDefinitions = [
{
name: "read_file",
description:
"Read a file from the codeflash-internal or codeflash CLI repository. " +
"Returns file content with line numbers. Use start_line/end_line for large files.",
"Read a file from the codeflash-internal or codeflash CLI repository with line numbers. " +
"Use this AFTER search_code to read full context around a match. Essential for reading " +
"prompt templates (.md files) and pipeline code. Your investigation is incomplete until " +
"you've read the relevant source file.",
input_schema: {
type: "object" as const,
properties: {
@@ -434,7 +447,10 @@ export const anthropicToolDefinitions = [
{
name: "search_code",
description:
"Search for a pattern across a repository using ripgrep. Returns matching lines with file paths and line numbers.",
"Ripgrep search across a repository. Returns matching lines with file paths and line " +
"numbers. Use this AFTER get_llm_call_detail to find the prompt template or pipeline " +
"code responsible for a problem. Search for distinctive phrases from the prompt to locate " +
"the template file. Also use to find where specific pipeline stages are implemented.",
input_schema: {
type: "object" as const,
properties: {
@@ -445,11 +461,11 @@ export const anthropicToolDefinitions = [
},
pattern: {
type: "string",
description: "Regex pattern to search for (ripgrep syntax).",
description: "Regex pattern to search for (ripgrep syntax). Be specific — use distinctive phrases.",
},
glob: {
type: "string",
description: "File glob filter (e.g., '*.py', '*.ts').",
description: "File glob filter (e.g., '*.py', '*.md', '*.ts'). Use '*.md' for prompt templates.",
},
max_results: {
type: "number",
@@ -462,7 +478,9 @@ export const anthropicToolDefinitions = [
{
name: "list_directory",
description:
"List files and directories in a repository path. Directories are listed first, sorted alphabetically.",
"List files and directories in a repository path. Use this to orient yourself when " +
"navigating unfamiliar parts of the codebase, or to discover prompt template files " +
"alongside pipeline modules.",
input_schema: {
type: "object" as const,
properties: {