mirror of
https://github.com/codeflash-ai/codeflash-internal.git
synced 2026-05-04 18:25:18 +00:00
feat: add tool activity display and fix streaming timeout in observability chat
Restructure agent loop to use stream()+finalMessage() for all API calls, fixing the SDK's non-streaming timeout error with max_tokens 32k. Add parallel tool execution, tool activity bubbles in the frontend, and restructure the system prompt for better investigation behavior.
This commit is contained in:
parent
51372ca0ad
commit
b09262ccbc
3 changed files with 332 additions and 132 deletions
|
|
@ -17,6 +17,18 @@ interface ChatMessage {
|
|||
const MAX_TOOL_ROUNDS = 15
|
||||
const KEEPALIVE_INTERVAL_MS = 15_000
|
||||
|
||||
const TOOL_DISPLAY_NAMES: Record<string, string> = {
|
||||
get_original_code: "Reading original code",
|
||||
get_candidate_code: "Reading candidate code",
|
||||
get_test_code: "Reading test code",
|
||||
get_ranking_details: "Reading ranking details",
|
||||
get_errors: "Checking errors",
|
||||
get_llm_call_detail: "Inspecting LLM call",
|
||||
read_file: "Reading file",
|
||||
search_code: "Searching codebase",
|
||||
list_directory: "Listing directory",
|
||||
}
|
||||
|
||||
function getClient(): Anthropic {
|
||||
const baseURL = process.env.ANTHROPIC_FOUNDRY_BASE_URL
|
||||
const apiKey = process.env.AZURE_OPENAI_API_KEY
|
||||
|
|
@ -26,26 +38,99 @@ function getClient(): Anthropic {
|
|||
return new Anthropic({ baseURL, apiKey })
|
||||
}
|
||||
|
||||
async function processToolUseResponse(
|
||||
response: Anthropic.Message,
|
||||
indexed: IndexedTraceData
|
||||
function summarizeToolResult(toolName: string, result: string): string {
|
||||
const lines = result.split("\n").filter(Boolean)
|
||||
switch (toolName) {
|
||||
case "search_code": {
|
||||
if (result === "No matches found.") return "No matches found"
|
||||
return `Found ${lines.length} matching lines`
|
||||
}
|
||||
case "read_file": {
|
||||
const header = lines[0] ?? ""
|
||||
return header.startsWith("File:") ? header : `Read ${lines.length} lines`
|
||||
}
|
||||
case "get_errors": {
|
||||
if (result === "No errors in this trace.") return "No errors"
|
||||
const count = lines.filter((l) => l.startsWith("[")).length
|
||||
return `Found ${count} errors`
|
||||
}
|
||||
case "get_llm_call_detail":
|
||||
return "Loaded full LLM call details"
|
||||
case "list_directory": {
|
||||
const count = lines.length - 1 // subtract header line
|
||||
return `Listed ${count} entries`
|
||||
}
|
||||
default:
|
||||
return "Done"
|
||||
}
|
||||
}
|
||||
|
||||
async function processToolCalls(
|
||||
content: Anthropic.ContentBlock[],
|
||||
indexed: IndexedTraceData,
|
||||
enqueue: (data: string) => void,
|
||||
): Promise<Anthropic.ToolResultBlockParam[]> {
|
||||
const toolResults: Anthropic.ToolResultBlockParam[] = []
|
||||
for (const block of response.content) {
|
||||
if (block.type === "tool_use") {
|
||||
const toolUseBlocks = content.filter(
|
||||
(block): block is Anthropic.ToolUseBlock => block.type === "tool_use",
|
||||
)
|
||||
if (toolUseBlocks.length === 0) return []
|
||||
|
||||
// Emit tool_start events for all tools
|
||||
for (const block of toolUseBlocks) {
|
||||
enqueue(
|
||||
`data: ${JSON.stringify({
|
||||
type: "tool_start",
|
||||
tool: block.name,
|
||||
displayName: TOOL_DISPLAY_NAMES[block.name] ?? block.name,
|
||||
})}\n\n`,
|
||||
)
|
||||
}
|
||||
|
||||
// Execute all tool calls in parallel
|
||||
const results = await Promise.all(
|
||||
toolUseBlocks.map(async (block) => {
|
||||
const result = await resolveToolCall(
|
||||
block.name,
|
||||
(block.input as Record<string, unknown>) ?? {},
|
||||
indexed
|
||||
indexed,
|
||||
)
|
||||
toolResults.push({
|
||||
type: "tool_result",
|
||||
|
||||
enqueue(
|
||||
`data: ${JSON.stringify({
|
||||
type: "tool_result",
|
||||
tool: block.name,
|
||||
displayName: TOOL_DISPLAY_NAMES[block.name] ?? block.name,
|
||||
summary: summarizeToolResult(block.name, result),
|
||||
})}\n\n`,
|
||||
)
|
||||
|
||||
return {
|
||||
type: "tool_result" as const,
|
||||
tool_use_id: block.id,
|
||||
content: result,
|
||||
})
|
||||
}
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
||||
return results
|
||||
}
|
||||
|
||||
// Shared base params for both streaming and non-streaming calls.
|
||||
// Adaptive thinking lets Claude decide how much reasoning is needed per request.
|
||||
// On Opus 4.6, this automatically enables interleaved thinking (thinking between
|
||||
// tool calls) without needing a beta header.
|
||||
function baseParams(
|
||||
systemPrompt: string,
|
||||
conversationMessages: Anthropic.MessageParam[],
|
||||
) {
|
||||
return {
|
||||
model: "claude-opus-4-6" as const,
|
||||
max_tokens: 32_000,
|
||||
system: systemPrompt,
|
||||
messages: conversationMessages,
|
||||
tools: anthropicToolDefinitions as Anthropic.Tool[],
|
||||
thinking: { type: "adaptive" as const },
|
||||
}
|
||||
return toolResults
|
||||
}
|
||||
|
||||
export async function POST(request: NextRequest): Promise<Response> {
|
||||
|
|
@ -55,7 +140,7 @@ export async function POST(request: NextRequest): Promise<Response> {
|
|||
} catch (err) {
|
||||
return Response.json(
|
||||
{ error: err instanceof Error ? err.message : "Configuration error" },
|
||||
{ status: 500 }
|
||||
{ status: 500 },
|
||||
)
|
||||
}
|
||||
|
||||
|
|
@ -70,7 +155,7 @@ export async function POST(request: NextRequest): Promise<Response> {
|
|||
if (!traceId || !messages?.length) {
|
||||
return Response.json(
|
||||
{ error: "traceId and messages are required" },
|
||||
{ status: 400 }
|
||||
{ status: 400 },
|
||||
)
|
||||
}
|
||||
|
||||
|
|
@ -82,7 +167,6 @@ export async function POST(request: NextRequest): Promise<Response> {
|
|||
|
||||
const indexed = indexTraceData(traceData)
|
||||
const systemPrompt = buildSummaryPrompt(indexed)
|
||||
|
||||
const conversationMessages: Anthropic.MessageParam[] = messages.map((m) => ({
|
||||
role: m.role,
|
||||
content: m.content,
|
||||
|
|
@ -96,59 +180,44 @@ export async function POST(request: NextRequest): Promise<Response> {
|
|||
const keepalive = setInterval(() => enqueue(": keepalive\n\n"), KEEPALIVE_INTERVAL_MS)
|
||||
|
||||
try {
|
||||
// Tool resolution loop — sends keepalive pings to prevent gateway timeout
|
||||
// Tool resolution loop — uses stream() + finalMessage() to avoid the SDK's
|
||||
// non-streaming timeout limit (max_tokens 32k estimates >10min, which blocks
|
||||
// client.messages.create()). No text handler is attached, so no text reaches
|
||||
// the frontend — avoiding "stutter" from intermediate fragments.
|
||||
let toolRounds = 0
|
||||
while (toolRounds < MAX_TOOL_ROUNDS) {
|
||||
const response = await client.messages.create({
|
||||
model: "claude-opus-4-6",
|
||||
max_tokens: 4096,
|
||||
system: systemPrompt,
|
||||
messages: conversationMessages,
|
||||
tools: anthropicToolDefinitions as Anthropic.Tool[],
|
||||
})
|
||||
const toolStream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
|
||||
const response = await toolStream.finalMessage()
|
||||
|
||||
if (response.stop_reason !== "tool_use") {
|
||||
break
|
||||
}
|
||||
if (response.stop_reason !== "tool_use") break
|
||||
|
||||
conversationMessages.push({ role: "assistant", content: response.content })
|
||||
const toolResults = await processToolUseResponse(response, indexed)
|
||||
const toolResults = await processToolCalls(response.content, indexed, enqueue)
|
||||
conversationMessages.push({ role: "user", content: toolResults })
|
||||
toolRounds++
|
||||
}
|
||||
|
||||
// Stream the final response
|
||||
const messageStream = client.messages.stream({
|
||||
model: "claude-opus-4-6",
|
||||
max_tokens: 4096,
|
||||
system: systemPrompt,
|
||||
messages: conversationMessages,
|
||||
tools: anthropicToolDefinitions as Anthropic.Tool[],
|
||||
})
|
||||
// Stream the final response — this is the only text the user sees
|
||||
const messageStream = client.messages.stream(baseParams(systemPrompt, conversationMessages))
|
||||
|
||||
messageStream.on("text", (textDelta) => {
|
||||
enqueue(`data: ${JSON.stringify({ text: textDelta })}\n\n`)
|
||||
enqueue(`data: ${JSON.stringify({ type: "text", text: textDelta })}\n\n`)
|
||||
})
|
||||
|
||||
const finalMessage = await messageStream.finalMessage()
|
||||
|
||||
// Edge case: final streaming response ended with tool_use. Process tools
|
||||
// and make one more streaming call without tools to get a text response.
|
||||
if (finalMessage.stop_reason === "tool_use") {
|
||||
conversationMessages.push({ role: "assistant", content: finalMessage.content })
|
||||
const toolResults = await processToolUseResponse(finalMessage, indexed)
|
||||
const toolResults = await processToolCalls(finalMessage.content, indexed, enqueue)
|
||||
conversationMessages.push({ role: "user", content: toolResults })
|
||||
|
||||
const followUpStream = client.messages.stream({
|
||||
model: "claude-opus-4-6",
|
||||
max_tokens: 4096,
|
||||
system: systemPrompt,
|
||||
messages: conversationMessages,
|
||||
const followUp = client.messages.stream(baseParams(systemPrompt, conversationMessages))
|
||||
followUp.on("text", (textDelta) => {
|
||||
enqueue(`data: ${JSON.stringify({ type: "text", text: textDelta })}\n\n`)
|
||||
})
|
||||
|
||||
followUpStream.on("text", (textDelta) => {
|
||||
enqueue(`data: ${JSON.stringify({ text: textDelta })}\n\n`)
|
||||
})
|
||||
|
||||
await followUpStream.finalMessage()
|
||||
await followUp.finalMessage()
|
||||
}
|
||||
|
||||
enqueue("data: [DONE]\n\n")
|
||||
|
|
|
|||
|
|
@ -11,6 +11,13 @@ interface ChatMessage {
|
|||
content: string
|
||||
}
|
||||
|
||||
interface ToolStep {
|
||||
tool: string
|
||||
displayName: string
|
||||
status: "running" | "done"
|
||||
summary?: string
|
||||
}
|
||||
|
||||
interface TimelineChatProps {
|
||||
traceId: string
|
||||
isOpen: boolean
|
||||
|
|
@ -25,6 +32,8 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
const [messages, setMessages] = useState<ChatMessage[]>([])
|
||||
const [input, setInput] = useState("")
|
||||
const [isStreaming, setIsStreaming] = useState(false)
|
||||
const [completedRounds, setCompletedRounds] = useState<ToolStep[][]>([])
|
||||
const [activeSteps, setActiveSteps] = useState<ToolStep[]>([])
|
||||
const messagesEndRef = useRef<HTMLDivElement>(null)
|
||||
const inputRef = useRef<HTMLTextAreaElement>(null)
|
||||
const abortRef = useRef<AbortController | null>(null)
|
||||
|
|
@ -35,7 +44,7 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
|
||||
useEffect(() => {
|
||||
scrollToBottom()
|
||||
}, [messages, scrollToBottom])
|
||||
}, [messages, completedRounds, activeSteps, scrollToBottom])
|
||||
|
||||
useEffect(() => {
|
||||
if (isOpen) {
|
||||
|
|
@ -57,6 +66,8 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
abortRef.current = controller
|
||||
|
||||
setMessages((prev) => [...prev, { role: "assistant", content: "" }])
|
||||
setCompletedRounds([])
|
||||
setActiveSteps([])
|
||||
|
||||
try {
|
||||
const res = await fetch("/api/observability/chat", {
|
||||
|
|
@ -92,14 +103,52 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
|
||||
try {
|
||||
const parsed = JSON.parse(data)
|
||||
if (parsed.text) {
|
||||
|
||||
// Handle typed events (new protocol)
|
||||
if (parsed.type === "tool_start") {
|
||||
setActiveSteps((prev) => {
|
||||
// If all previous steps are done, this is a new round — commit previous steps
|
||||
if (prev.length > 0 && prev.every((s) => s.status === "done")) {
|
||||
setCompletedRounds((rounds) => [...rounds, prev])
|
||||
return [{
|
||||
tool: parsed.tool,
|
||||
displayName: parsed.displayName ?? parsed.tool,
|
||||
status: "running",
|
||||
}]
|
||||
}
|
||||
return [
|
||||
...prev,
|
||||
{
|
||||
tool: parsed.tool,
|
||||
displayName: parsed.displayName ?? parsed.tool,
|
||||
status: "running",
|
||||
},
|
||||
]
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
if (parsed.type === "tool_result") {
|
||||
setActiveSteps((prev) =>
|
||||
prev.map((step) =>
|
||||
step.tool === parsed.tool && step.status === "running"
|
||||
? { ...step, status: "done", summary: parsed.summary }
|
||||
: step,
|
||||
),
|
||||
)
|
||||
continue
|
||||
}
|
||||
|
||||
// Handle text — both new {type: "text", text} and old {text} formats
|
||||
const textContent = parsed.type === "text" ? parsed.text : parsed.text
|
||||
if (textContent) {
|
||||
setMessages((prev) => {
|
||||
const updated = [...prev]
|
||||
const last = updated[updated.length - 1]
|
||||
if (last?.role === "assistant") {
|
||||
updated[updated.length - 1] = {
|
||||
...last,
|
||||
content: last.content + parsed.text,
|
||||
content: last.content + textContent,
|
||||
}
|
||||
}
|
||||
return updated
|
||||
|
|
@ -129,6 +178,13 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
return updated
|
||||
})
|
||||
} finally {
|
||||
// Commit any remaining active steps as a final completed round
|
||||
setActiveSteps((prev) => {
|
||||
if (prev.length > 0) {
|
||||
setCompletedRounds((rounds) => [...rounds, prev])
|
||||
}
|
||||
return []
|
||||
})
|
||||
setIsStreaming(false)
|
||||
abortRef.current = null
|
||||
}
|
||||
|
|
@ -208,6 +264,13 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
{messages.map((msg, i) => (
|
||||
<ChatBubble key={i} message={msg} isStreaming={isStreaming && i === messages.length - 1} />
|
||||
))}
|
||||
|
||||
{completedRounds.map((steps, i) => (
|
||||
<ToolRoundBubble key={`round-${i}`} steps={steps} />
|
||||
))}
|
||||
|
||||
{activeSteps.length > 0 && <ToolRoundBubble steps={activeSteps} isActive />}
|
||||
|
||||
<div ref={messagesEndRef} />
|
||||
</div>
|
||||
|
||||
|
|
@ -253,6 +316,56 @@ export const TimelineChat = memo(function TimelineChat({
|
|||
)
|
||||
})
|
||||
|
||||
const ToolRoundBubble = memo(function ToolRoundBubble({
|
||||
steps,
|
||||
isActive = false,
|
||||
}: {
|
||||
steps: ToolStep[]
|
||||
isActive?: boolean
|
||||
}) {
|
||||
const allDone = steps.every((s) => s.status === "done")
|
||||
|
||||
return (
|
||||
<div className="flex gap-3">
|
||||
<div
|
||||
className={`flex-shrink-0 w-7 h-7 rounded-full flex items-center justify-center ${
|
||||
isActive && !allDone
|
||||
? "bg-amber-100 dark:bg-amber-900/40"
|
||||
: "bg-zinc-100 dark:bg-zinc-800"
|
||||
}`}
|
||||
>
|
||||
{isActive && !allDone ? (
|
||||
<Loader2 className="h-3.5 w-3.5 animate-spin text-amber-600 dark:text-amber-400" />
|
||||
) : (
|
||||
<Check className="h-3.5 w-3.5 text-green-500" />
|
||||
)}
|
||||
</div>
|
||||
<div className="max-w-[85%] rounded-lg px-3 py-2 bg-zinc-50 dark:bg-zinc-800/50 border border-zinc-200 dark:border-zinc-700">
|
||||
<div className="flex flex-col gap-1">
|
||||
{steps.map((step, i) => (
|
||||
<div
|
||||
key={`${step.tool}-${i}`}
|
||||
className="flex items-center gap-2 text-xs text-zinc-500 dark:text-zinc-400"
|
||||
>
|
||||
{step.status === "running" ? (
|
||||
<Loader2 className="h-3 w-3 animate-spin flex-shrink-0 text-amber-500" />
|
||||
) : (
|
||||
<Check className="h-3 w-3 text-green-500 flex-shrink-0" />
|
||||
)}
|
||||
<span>
|
||||
{step.displayName}
|
||||
{step.summary && (
|
||||
<span className="text-zinc-400 dark:text-zinc-500"> — {step.summary}</span>
|
||||
)}
|
||||
</span>
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
})
|
||||
|
||||
const ChatBubble = memo(function ChatBubble({
|
||||
message,
|
||||
isStreaming,
|
||||
|
|
|
|||
|
|
@ -168,75 +168,56 @@ function truncate(s: string, max: number): string {
|
|||
export function buildSummaryPrompt(data: IndexedTraceData): string {
|
||||
const lines: string[] = []
|
||||
|
||||
// === SECTION 1: ROLE + CRITICAL BEHAVIOR (primacy — first thing the model sees) ===
|
||||
lines.push(
|
||||
"You are an assistant helping a developer understand an optimization trace from Codeflash. " +
|
||||
"You have tools to fetch specific data from the trace on demand. Use them to answer the user's " +
|
||||
"questions — only fetch what you need. Be concise and reference specific candidates when relevant.\n\n" +
|
||||
"IMPORTANT: The user may paste errors, warnings, logs, or other output from their CLI runs that " +
|
||||
"are NOT captured in the trace data you have access to. The trace data is incomplete — it does not " +
|
||||
"record everything that happens during an optimization run. When the user shares information, trust " +
|
||||
"it as ground truth even if the trace data doesn't confirm it. Never say 'the trace shows no errors' " +
|
||||
"to contradict what the user is reporting. Instead, use the trace data you DO have to help explain " +
|
||||
"and investigate what they're seeing.\n\n" +
|
||||
"You also have codebase browsing tools (read_file, search_code, list_directory) that let you " +
|
||||
"navigate the codeflash-internal and codeflash CLI source code, and a get_llm_call_detail tool " +
|
||||
"to inspect the full prompts and responses of any LLM call in this trace. Use these to trace " +
|
||||
"problems end-to-end: see what went wrong in the trace, read the actual prompts sent, then " +
|
||||
"navigate to the pipeline code to suggest a concrete fix.",
|
||||
"You are an investigation agent for Codeflash optimization traces. Your job is to diagnose " +
|
||||
"problems, trace them to root causes in the code, and recommend specific fixes.\n\n" +
|
||||
"CRITICAL RULES:\n" +
|
||||
"1. You MUST use tools before answering any diagnostic question. Never speculate when you can look.\n" +
|
||||
"2. Use tools liberally. It is always better to over-investigate than to give a shallow answer.\n" +
|
||||
"3. When multiple tool calls are independent, call them in parallel to save time.\n" +
|
||||
"4. Trust user-provided information (CLI output, errors, logs) as ground truth, even when trace " +
|
||||
"data doesn't confirm it. The trace is incomplete — it does not capture everything.\n" +
|
||||
"5. Your output should be concise and reference specific candidates, file paths, and line numbers.",
|
||||
)
|
||||
|
||||
// === SECTION 2: TOOL REFERENCE (when to use each) ===
|
||||
lines.push("")
|
||||
lines.push("=== DEBUGGING TOOLS — USE THESE PROACTIVELY ===")
|
||||
lines.push("=== TOOL REFERENCE ===")
|
||||
lines.push(
|
||||
"You have tools beyond trace data. Your job is not just to describe what happened — it's to " +
|
||||
"investigate WHY it happened and point to the specific code or prompt that needs to change. " +
|
||||
"Always go one level deeper than the surface-level observation.\n\n" +
|
||||
"IMPORTANT: When you identify a problem (bad tests, failed optimizations, parsing errors, etc.), " +
|
||||
"you MUST use get_llm_call_detail to inspect the actual prompts and responses involved. Then, if " +
|
||||
"the issue traces back to a prompt or pipeline bug, use the codebase browsing tools to find the " +
|
||||
"source code and suggest a concrete fix. Do not stop at 'the tests used mocks' — find out what " +
|
||||
"prompt instructions led to that and where to fix them.\n\n" +
|
||||
"=== get_llm_call_detail(call_id) ===\n" +
|
||||
"Fetches the full system prompt, user prompt, raw LLM response, and parsing results for any " +
|
||||
"LLM call in this trace. You SHOULD use this:\n" +
|
||||
"- When analyzing test quality: inspect the testgen prompt to see what instructions the model " +
|
||||
"received. Did the prompt forbid mocks? Did it provide enough context about the classes?\n" +
|
||||
"- When investigating bad optimizations: read the optimizer prompt to check if context was " +
|
||||
"missing or if instructions were unclear\n" +
|
||||
"- When debugging parsing failures: compare raw_response vs parsed_response to find extraction bugs\n" +
|
||||
"- When understanding ranking decisions: read the ranker prompt and response\n\n" +
|
||||
"=== read_file, search_code, list_directory ===\n" +
|
||||
"Browse the codeflash-internal and codeflash (CLI) source repos. You SHOULD use these:\n" +
|
||||
"- After inspecting an LLM call, find the prompt template to suggest a specific fix\n" +
|
||||
"- To understand how a pipeline stage works (postprocessing, deduplication, instrumentation)\n" +
|
||||
"- To trace a code path from an LLM call back to the pipeline logic that invoked it\n" +
|
||||
"- When the user asks 'where does X happen' or 'why does Y behave this way'\n\n" +
|
||||
"TRACE DATA TOOLS (for understanding what happened):\n" +
|
||||
"• get_errors — Start here. Shows all errors from the optimization (test failures, parsing errors). " +
|
||||
"Use this first to understand what went wrong before diving deeper.\n" +
|
||||
"• get_original_code — Returns the original function source. Use when you need to understand what " +
|
||||
"was being optimized.\n" +
|
||||
"• get_candidate_code — Returns a specific optimization candidate with its explanation, model, and " +
|
||||
"rank. Use when investigating why a candidate was ranked low or failed.\n" +
|
||||
"• get_test_code — Returns generated/instrumented test code. Use when investigating test quality " +
|
||||
"issues (mocks, bad assertions, missing coverage).\n" +
|
||||
"• get_ranking_details — Returns the full ranking explanation. Use when investigating why candidates " +
|
||||
"were ranked the way they were.\n\n" +
|
||||
"DEBUGGING TOOLS (for understanding WHY it happened):\n" +
|
||||
"• get_llm_call_detail — YOUR PRIMARY DEBUGGING TOOL. Fetches the full system prompt, user prompt, " +
|
||||
"raw response, and parsing results for any LLM call. Use this FIRST when investigating any " +
|
||||
"LLM-related issue: bad tests, bad optimizations, parsing failures, ranking decisions. This tool " +
|
||||
"tells you exactly what instructions the model received and what it produced.\n\n" +
|
||||
"CODEBASE TOOLS (for finding where to fix it):\n" +
|
||||
"• search_code — Ripgrep search across codeflash-internal or codeflash CLI repos. Use AFTER " +
|
||||
"inspecting an LLM call to find the prompt template or pipeline code responsible. Search for " +
|
||||
"distinctive phrases from the prompt to locate the template file.\n" +
|
||||
"• read_file — Read a file with line numbers. Use after search_code to read full context around " +
|
||||
"a match. Also use to read prompt templates (.md files) and pipeline modules.\n" +
|
||||
"• list_directory — List files/dirs in a repo path. Use to orient yourself when navigating " +
|
||||
"unfamiliar parts of the codebase.\n\n" +
|
||||
"Key paths in codeflash-internal:\n" +
|
||||
"- django/aiservice/core/shared/ — optimizer_router, testgen_router, ranker\n" +
|
||||
"- django/aiservice/core/languages/python/optimizer/ — Python optimizer pipeline\n" +
|
||||
"- django/aiservice/core/languages/python/testgen/ — test generation pipeline\n" +
|
||||
"- django/aiservice/aiservice/llm.py — LLM provider abstraction\n" +
|
||||
"- Prompt templates are .md files alongside their modules (rendered with Jinja2)\n\n" +
|
||||
"=== EXPECTED WORKFLOW — YOU MUST COMPLETE ALL STEPS ===\n" +
|
||||
"When you find a problem in a trace, DO NOT stop at describing the symptoms. You MUST complete " +
|
||||
"the full investigation:\n\n" +
|
||||
"1. OBSERVE: Answer the user's question using trace data tools (get_test_code, get_candidate_code, etc.)\n" +
|
||||
"2. INVESTIGATE: Use get_llm_call_detail to read the prompts and responses that caused the problem. " +
|
||||
"Identify whether the issue is a prompt gap, a model failure to follow instructions, or a pipeline bug.\n" +
|
||||
"3. LOCATE: Use search_code to find the prompt template or pipeline code responsible. Read it with " +
|
||||
"read_file. Prompt templates are .md files — search for distinctive phrases from the prompt you found " +
|
||||
"in step 2 to locate the template file.\n" +
|
||||
"4. RECOMMEND: Suggest a concrete fix — name the file, quote the relevant section, and describe " +
|
||||
"what to change. For example: 'In django/aiservice/core/languages/python/testgen/prompt.md, the " +
|
||||
"no-mocks instruction at line 45 should be moved to the system prompt for stronger enforcement.'\n\n" +
|
||||
"If you skip steps 3-4, your response is INCOMPLETE. The user is a developer who wants actionable " +
|
||||
"fixes, not just observations about what went wrong.\n\n" +
|
||||
"HARD REQUIREMENT: When you identify a problem caused by a prompt or pipeline stage, your response " +
|
||||
"MUST include at least one real file path from the codebase that you found via search_code or " +
|
||||
"read_file. Generic advice like 'strengthen the prompt' is not enough — find the actual file, " +
|
||||
"read it, and reference the specific lines that need to change.",
|
||||
"- Prompt templates are .md files alongside their modules (rendered with Jinja2)",
|
||||
)
|
||||
|
||||
// === SECTION 3: DOMAIN KNOWLEDGE (middle) ===
|
||||
lines.push("")
|
||||
lines.push("=== CODEFLASH TESTING GUIDELINES ===")
|
||||
lines.push(
|
||||
|
|
@ -253,6 +234,7 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
|
|||
"the cause before blaming the optimizations.",
|
||||
)
|
||||
|
||||
// === SECTION 4: TRACE OVERVIEW (the data itself) ===
|
||||
lines.push("")
|
||||
lines.push("=== TRACE OVERVIEW ===")
|
||||
|
||||
|
|
@ -314,17 +296,33 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
|
|||
}
|
||||
}
|
||||
|
||||
// === SECTION 5: WORKFLOW + CHECKLIST (recency — last thing the model sees) ===
|
||||
lines.push("")
|
||||
lines.push("=== RESPONSE CHECKLIST (review before responding) ===")
|
||||
lines.push("=== INVESTIGATION WORKFLOW ===")
|
||||
lines.push(
|
||||
"Before you send your response, verify:\n" +
|
||||
"[ ] If you identified a problem (bad tests, failed optimization, parsing error, etc.), did you " +
|
||||
"use get_llm_call_detail to read the actual prompt/response that caused it?\n" +
|
||||
"[ ] If the root cause is in a prompt or pipeline, did you use search_code and read_file to " +
|
||||
"find the actual source file? Your response MUST include at least one real file path from the " +
|
||||
"codebase (e.g., 'django/aiservice/core/languages/python/testgen/system_prompt.md').\n" +
|
||||
"[ ] Are your recommendations grounded in specific code you read, not generic advice?\n\n" +
|
||||
"If any box is unchecked, go back and use the tools before responding.",
|
||||
"When the user asks about a problem, follow these steps IN ORDER. Do not skip steps.\n\n" +
|
||||
"1. OBSERVE: Use get_errors and trace data tools to understand what happened.\n" +
|
||||
"2. INVESTIGATE: Use get_llm_call_detail to read the actual prompts and responses that caused the " +
|
||||
"problem. Identify whether the issue is a prompt gap, a model failure, or a pipeline bug.\n" +
|
||||
"3. LOCATE: Use search_code to find the prompt template or pipeline code responsible. Read it with " +
|
||||
"read_file. Prompt templates are .md files — search for distinctive phrases from the prompt you " +
|
||||
"found in step 2 to locate the template file.\n" +
|
||||
"4. RECOMMEND: Name the file, quote the relevant section, describe what to change.\n\n" +
|
||||
"EXAMPLE of a good investigation:\n" +
|
||||
" User: 'Why did the tests use mocks?'\n" +
|
||||
" → get_test_code(index=1) → sees Mock objects in generated tests\n" +
|
||||
" → get_llm_call_detail(call_id='...testgen...') → reads the system prompt, finds no anti-mock instruction\n" +
|
||||
" → search_code(repo='codeflash-internal', pattern='mock.*MagicMock', glob='*.md') → finds prompt template\n" +
|
||||
" → read_file(repo='codeflash-internal', path='django/aiservice/.../testgen/system_prompt.md') → reads it\n" +
|
||||
" → Response: 'The testgen system prompt at django/.../testgen/system_prompt.md:42 does not " +
|
||||
"explicitly forbid mocks. Add an instruction after line 42: \"Never use Mock, MagicMock, patch, " +
|
||||
"or SimpleNamespace. Always construct real instances.\"'\n\n" +
|
||||
"BEFORE YOU RESPOND, verify:\n" +
|
||||
"- Did you use get_llm_call_detail to read the actual prompt/response?\n" +
|
||||
"- Did you use search_code/read_file to find the source file?\n" +
|
||||
"- Does your response include at least one real file path with line numbers?\n" +
|
||||
"- Are your recommendations grounded in code you actually read?\n\n" +
|
||||
"If any answer is no, GO BACK AND USE THE TOOLS. Do not respond with generic advice.",
|
||||
)
|
||||
|
||||
return lines.join("\n")
|
||||
|
|
@ -334,14 +332,17 @@ export const anthropicToolDefinitions = [
|
|||
{
|
||||
name: "get_original_code",
|
||||
description:
|
||||
"Returns the original source code of the function being optimized.",
|
||||
"Returns the original source code of the function being optimized. Use this when you need " +
|
||||
"to understand what was being optimized, compare with candidate code, or check if the " +
|
||||
"original function has patterns that explain optimization failures.",
|
||||
input_schema: { type: "object" as const, properties: {} },
|
||||
},
|
||||
{
|
||||
name: "get_candidate_code",
|
||||
description:
|
||||
"Returns the optimized code, explanation, and metadata for a specific candidate. " +
|
||||
"Use source_type and index to identify the candidate (e.g., source_type='OPTIMIZE', index=1 for the first optimization candidate).",
|
||||
"Returns the optimized code, explanation, model, and rank for a specific candidate. " +
|
||||
"Use this when investigating why a candidate was ranked low, failed tests, or produced " +
|
||||
"incorrect output. Call this for multiple candidates to compare approaches.",
|
||||
input_schema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
|
|
@ -361,8 +362,10 @@ export const anthropicToolDefinitions = [
|
|||
{
|
||||
name: "get_test_code",
|
||||
description:
|
||||
"Returns the generated test code for a specific test group, including which model generated it. " +
|
||||
"Each test group may have generated, instrumented, and performance-instrumented variants.",
|
||||
"Returns the generated test code for a specific test group with model info. " +
|
||||
"Use this when investigating test quality issues — look for Mock, MagicMock, patch, " +
|
||||
"SimpleNamespace, or other fake objects. Also use when candidates fail tests to check " +
|
||||
"whether the tests themselves are the problem.",
|
||||
input_schema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
|
|
@ -377,20 +380,28 @@ export const anthropicToolDefinitions = [
|
|||
{
|
||||
name: "get_ranking_details",
|
||||
description:
|
||||
"Returns the full ranking explanation and the ordered list of candidates.",
|
||||
"Returns the full ranking explanation and ordered candidate list. Use this when " +
|
||||
"investigating why certain candidates were preferred, why good-looking code was ranked low, " +
|
||||
"or why the ranking disagrees with expected results.",
|
||||
input_schema: { type: "object" as const, properties: {} },
|
||||
},
|
||||
{
|
||||
name: "get_errors",
|
||||
description:
|
||||
"Returns all errors encountered during the optimization, including test failures and their context.",
|
||||
"Returns ALL errors from the optimization: test failures, parsing errors, pipeline errors, " +
|
||||
"and their context. This should be one of your FIRST calls when investigating any failure — " +
|
||||
"it tells you what went wrong before you dive into why.",
|
||||
input_schema: { type: "object" as const, properties: {} },
|
||||
},
|
||||
{
|
||||
name: "get_llm_call_detail",
|
||||
description:
|
||||
"Returns the full prompt, response, and parsing results for a specific LLM call. " +
|
||||
"Use the call_id from the LLM call IDs listed in the trace overview.",
|
||||
"YOUR PRIMARY DEBUGGING TOOL. Fetches the full system prompt, user prompt, raw LLM " +
|
||||
"response, parsed response, and parsing results for any LLM call in this trace. " +
|
||||
"Use this FIRST when investigating any LLM-related issue: bad tests, bad optimizations, " +
|
||||
"parsing failures, ranking decisions. This tells you exactly what instructions the model " +
|
||||
"received and what it produced. After reading the prompt, use search_code to find the " +
|
||||
"template file and suggest a fix.",
|
||||
input_schema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
|
|
@ -405,8 +416,10 @@ export const anthropicToolDefinitions = [
|
|||
{
|
||||
name: "read_file",
|
||||
description:
|
||||
"Read a file from the codeflash-internal or codeflash CLI repository. " +
|
||||
"Returns file content with line numbers. Use start_line/end_line for large files.",
|
||||
"Read a file from the codeflash-internal or codeflash CLI repository with line numbers. " +
|
||||
"Use this AFTER search_code to read full context around a match. Essential for reading " +
|
||||
"prompt templates (.md files) and pipeline code. Your investigation is incomplete until " +
|
||||
"you've read the relevant source file.",
|
||||
input_schema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
|
|
@ -434,7 +447,10 @@ export const anthropicToolDefinitions = [
|
|||
{
|
||||
name: "search_code",
|
||||
description:
|
||||
"Search for a pattern across a repository using ripgrep. Returns matching lines with file paths and line numbers.",
|
||||
"Ripgrep search across a repository. Returns matching lines with file paths and line " +
|
||||
"numbers. Use this AFTER get_llm_call_detail to find the prompt template or pipeline " +
|
||||
"code responsible for a problem. Search for distinctive phrases from the prompt to locate " +
|
||||
"the template file. Also use to find where specific pipeline stages are implemented.",
|
||||
input_schema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
|
|
@ -445,11 +461,11 @@ export const anthropicToolDefinitions = [
|
|||
},
|
||||
pattern: {
|
||||
type: "string",
|
||||
description: "Regex pattern to search for (ripgrep syntax).",
|
||||
description: "Regex pattern to search for (ripgrep syntax). Be specific — use distinctive phrases.",
|
||||
},
|
||||
glob: {
|
||||
type: "string",
|
||||
description: "File glob filter (e.g., '*.py', '*.ts').",
|
||||
description: "File glob filter (e.g., '*.py', '*.md', '*.ts'). Use '*.md' for prompt templates.",
|
||||
},
|
||||
max_results: {
|
||||
type: "number",
|
||||
|
|
@ -462,7 +478,9 @@ export const anthropicToolDefinitions = [
|
|||
{
|
||||
name: "list_directory",
|
||||
description:
|
||||
"List files and directories in a repository path. Directories are listed first, sorted alphabetically.",
|
||||
"List files and directories in a repository path. Use this to orient yourself when " +
|
||||
"navigating unfamiliar parts of the codebase, or to discover prompt template files " +
|
||||
"alongside pipeline modules.",
|
||||
input_schema: {
|
||||
type: "object" as const,
|
||||
properties: {
|
||||
|
|
|
|||
Loading…
Reference in a new issue