feat: add codebase browsing and LLM call inspection to observability chat

Give the observability chat agent four new tools: get_llm_call_detail (full prompt/response for any LLM call), read_file, search_code, and list_directory for navigating the codeflash-internal and codeflash CLI repos. This lets the agent trace problems end-to-end from trace data through actual prompts to pipeline source code.

- Add id to IndexedTraceData.llmCalls so the agent can reference calls
- Make resolveToolCall async (Prisma + fs + child_process)
- Make processToolUseResponse async to match
- Bump MAX_TOOL_ROUNDS from 5 to 15 for multi-step code browsing
- Add CODEFLASH_INTERNAL_REPO_PATH / CODEFLASH_CLI_REPO_PATH env vars
- Path traversal protection, file size caps, search result limits
2026-05-04 18:25:18 +00:00 · 2026-02-15 00:03:25 -05:00 · 2026-02-15 00:03:25 -05:00 · 782ee508de
commit 782ee508de
parent eecd3ba4ce
3 changed files with 334 additions and 11 deletions
--- a/js/cf-webapp/.env.example
+++ b/js/cf-webapp/.env.example
@ -8,3 +8,5 @@ NPM_TOKEN
 SCM_DO_BUILD_DURING_DEPLOYMENT
 WEBSITE_HEALTHCHECK_MAXPINGFAILURES
 WEBSITE_HTTPLOGGING_RETENTION_DAYS
+CODEFLASH_INTERNAL_REPO_PATH=
+CODEFLASH_CLI_REPO_PATH=
--- a/js/cf-webapp/src/app/api/observability/chat/route.ts
+++ b/js/cf-webapp/src/app/api/observability/chat/route.ts
@ -14,7 +14,7 @@ interface ChatMessage {
  content: string
 }

-const MAX_TOOL_ROUNDS = 5
+const MAX_TOOL_ROUNDS = 15
 const KEEPALIVE_INTERVAL_MS = 15_000

 function getClient(): Anthropic {
@ -26,14 +26,14 @@ function getClient(): Anthropic {
  return new Anthropic({ baseURL, apiKey })
 }

-function processToolUseResponse(
+async function processToolUseResponse(
  response: Anthropic.Message,
  indexed: IndexedTraceData
-): Anthropic.ToolResultBlockParam[] {
+): Promise<Anthropic.ToolResultBlockParam[]> {
  const toolResults: Anthropic.ToolResultBlockParam[] = []
  for (const block of response.content) {
    if (block.type === "tool_use") {
-      const result = resolveToolCall(
+      const result = await resolveToolCall(
        block.name,
        (block.input as Record<string, unknown>) ?? {},
        indexed
@ -112,7 +112,7 @@ export async function POST(request: NextRequest): Promise<Response> {
          }

          conversationMessages.push({ role: "assistant", content: response.content })
-          const toolResults = processToolUseResponse(response, indexed)
+          const toolResults = await processToolUseResponse(response, indexed)
          conversationMessages.push({ role: "user", content: toolResults })
          toolRounds++
        }
@ -134,7 +134,7 @@ export async function POST(request: NextRequest): Promise<Response> {

        if (finalMessage.stop_reason === "tool_use") {
          conversationMessages.push({ role: "assistant", content: finalMessage.content })
-          const toolResults = processToolUseResponse(finalMessage, indexed)
+          const toolResults = await processToolUseResponse(finalMessage, indexed)
          conversationMessages.push({ role: "user", content: toolResults })

          const followUpStream = client.messages.stream({
--- a/js/cf-webapp/src/app/observability/lib/build-chat-context.ts
+++ b/js/cf-webapp/src/app/observability/lib/build-chat-context.ts
@ -1,3 +1,7 @@
+import path from "node:path"
+import { readdir, readFile, stat } from "node:fs/promises"
+import { execFile } from "node:child_process"
+import { prisma } from "@/lib/prisma"
 import type { TraceData } from "./get-trace-data"

 export interface IndexedTraceData {
@ -27,6 +31,7 @@ export interface IndexedTraceData {
    context: Record<string, unknown> | null
  }>
  llmCalls: Array<{
+    id: string
    call_type: string | null
    model_name: string | null
    status: string
@ -107,6 +112,7 @@ export function indexTraceData(traceData: TraceData): IndexedTraceData {
      context: e.context as Record<string, unknown> | null,
    })),
    llmCalls: rawLlmCalls.map((c) => ({
+      id: c.id,
      call_type: c.call_type,
      model_name: c.model_name,
      status: c.status,
@ -129,6 +135,36 @@ function findModelForTestGroup(testIndex: number, data: IndexedTraceData): strin
  return match?.model_name ?? null
 }

+// --- Codebase browsing helpers ---
+
+/**
+ * Maps a repository identifier used by the browsing tools to the checkout
+ * directory configured through environment variables.
+ *
+ * @param repo - "codeflash-internal" or "codeflash"; any other value is unknown.
+ * @returns The configured path, or null when the identifier is unknown or the
+ *   matching environment variable is unset or empty.
+ */
+function getRepoRoot(repo: string): string | null {
+  switch (repo) {
+    case "codeflash-internal":
+      return process.env.CODEFLASH_INTERNAL_REPO_PATH || null
+    case "codeflash":
+      return process.env.CODEFLASH_CLI_REPO_PATH || null
+    default:
+      return null
+  }
+}
+
+/**
+ * Resolves a caller-supplied relative path against a repository root and
+ * rejects anything that would land outside that root.
+ *
+ * Containment is decided with path.relative() against the resolved root rather
+ * than a string-prefix comparison, so a root configured with a trailing slash
+ * or as a relative path still works, and names that merely begin with ".."
+ * (for example "..hidden") are not mistaken for traversal.
+ *
+ * NOTE(review): the check is purely lexical. Symlinks inside the repository are
+ * not resolved here, so a link that points outside the root would still be
+ * followed by the caller's fs operations.
+ *
+ * @param repoRoot - Directory the result must stay inside.
+ * @param relativePath - Path relative to repoRoot, typically supplied by the model.
+ * @returns `{ resolved }` with the absolute path, or `{ error }` with a message.
+ */
+function resolveAndValidatePath(
+  repoRoot: string,
+  relativePath: string,
+): { resolved: string } | { error: string } {
+  // Callers only cast tool arguments to string, so guard at runtime:
+  // the path APIs throw a TypeError on non-string input.
+  if (typeof relativePath !== "string") {
+    return { error: "Path must be a string." }
+  }
+  if (path.isAbsolute(relativePath)) {
+    return { error: "Path traversal is not allowed." }
+  }
+
+  const root = path.resolve(repoRoot)
+  const resolved = path.resolve(root, relativePath)
+
+  // A relative route from root to the target that climbs upward (or is absolute,
+  // e.g. a different drive on Windows) means the target is outside the root.
+  const fromRoot = path.relative(root, resolved)
+  const escapesRoot =
+    fromRoot === ".." || fromRoot.startsWith(".." + path.sep) || path.isAbsolute(fromRoot)
+  if (escapesRoot) {
+    return { error: "Path traversal is not allowed." }
+  }
+  return { resolved }
+}
+
+/**
+ * Caps a string at `max` characters, appending a marker when text was cut.
+ *
+ * @param s - Text to cap.
+ * @param max - Maximum number of characters (UTF-16 code units) kept from `s`.
+ * @returns `s` unchanged when it fits; otherwise its first `max` characters
+ *   followed by a "[truncated at N chars]" note on a new line.
+ */
+function truncate(s: string, max: number): string {
+  const fits = s.length <= max
+  return fits ? s : `${s.slice(0, max)}\n... [truncated at ${max} chars]`
+}
+
+// --- System prompt ---
+
 export function buildSummaryPrompt(data: IndexedTraceData): string {
  const lines: string[] = []

@ -141,7 +177,12 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
      "record everything that happens during an optimization run. When the user shares information, trust " +
      "it as ground truth even if the trace data doesn't confirm it. Never say 'the trace shows no errors' " +
      "to contradict what the user is reporting. Instead, use the trace data you DO have to help explain " +
-      "and investigate what they're seeing."
+      "and investigate what they're seeing.\n\n" +
+      "You also have codebase browsing tools (read_file, search_code, list_directory) that let you " +
+      "navigate the codeflash-internal and codeflash CLI source code, and a get_llm_call_detail tool " +
+      "to inspect the full prompts and responses of any LLM call in this trace. Use these to trace " +
+      "problems end-to-end: see what went wrong in the trace, read the actual prompts sent, then " +
+      "navigate to the pipeline code to suggest a concrete fix.",
  )

  lines.push("")
@ -157,7 +198,7 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
      "Import the real classes and call their real constructors with real arguments. This ensures " +
      "attribute access, method calls, and object protocols behave identically to production.\n" +
      "- If many candidates fail or are unranked, check whether mock/fake objects in tests could be " +
-      "the cause before blaming the optimizations."
+      "the cause before blaming the optimizations.",
  )

  lines.push("")
@ -212,6 +253,15 @@ export function buildSummaryPrompt(data: IndexedTraceData): string {
    }
  }

+  if (data.llmCalls.length > 0) {
+    lines.push("")
+    lines.push("=== LLM CALL IDS (for get_llm_call_detail) ===")
+    for (const c of data.llmCalls) {
+      const cost = c.llm_cost != null ? ` $${c.llm_cost.toFixed(4)}` : ""
+      lines.push(`  - ${c.id}: ${c.call_type ?? "unknown"} (${c.model_name ?? "unknown"}, ${c.status}${cost})`)
+    }
+  }
+
  return lines.join("\n")
 }

@ -271,13 +321,106 @@ export const anthropicToolDefinitions = [
      "Returns all errors encountered during the optimization, including test failures and their context.",
    input_schema: { type: "object" as const, properties: {} },
  },
+  {
+    name: "get_llm_call_detail",
+    description:
+      "Returns the full prompt, response, and parsing results for a specific LLM call. " +
+      "Use the call_id from the LLM call IDs listed in the trace overview.",
+    input_schema: {
+      type: "object" as const,
+      properties: {
+        call_id: {
+          type: "string",
+          description: "UUID of the LLM call (from the trace overview).",
+        },
+      },
+      required: ["call_id"],
+    },
+  },
+  {
+    name: "read_file",
+    description:
+      "Read a file from the codeflash-internal or codeflash CLI repository. " +
+      "Returns file content with line numbers. Use start_line/end_line for large files.",
+    input_schema: {
+      type: "object" as const,
+      properties: {
+        repo: {
+          type: "string",
+          enum: ["codeflash-internal", "codeflash"],
+          description: "Which repository to read from.",
+        },
+        path: {
+          type: "string",
+          description: "Relative path within the repo (e.g., 'django/aiservice/core/shared/optimizer_router.py').",
+        },
+        start_line: {
+          type: "number",
+          description: "1-based start line (default: 1).",
+        },
+        end_line: {
+          type: "number",
+          description: "1-based end line (default: start_line + 499).",
+        },
+      },
+      required: ["repo", "path"],
+    },
+  },
+  {
+    name: "search_code",
+    description:
+      "Search for a pattern across a repository using ripgrep. Returns matching lines with file paths and line numbers.",
+    input_schema: {
+      type: "object" as const,
+      properties: {
+        repo: {
+          type: "string",
+          enum: ["codeflash-internal", "codeflash"],
+          description: "Which repository to search.",
+        },
+        pattern: {
+          type: "string",
+          description: "Regex pattern to search for (ripgrep syntax).",
+        },
+        glob: {
+          type: "string",
+          description: "File glob filter (e.g., '*.py', '*.ts').",
+        },
+        max_results: {
+          type: "number",
+          description: "Maximum number of matching lines to return (default: 30, max: 100).",
+        },
+      },
+      required: ["repo", "pattern"],
+    },
+  },
+  {
+    name: "list_directory",
+    description:
+      "List files and directories in a repository path. Directories are listed first, sorted alphabetically.",
+    input_schema: {
+      type: "object" as const,
+      properties: {
+        repo: {
+          type: "string",
+          enum: ["codeflash-internal", "codeflash"],
+          description: "Which repository to browse.",
+        },
+        path: {
+          type: "string",
+          description: "Relative path within the repo (default: root).",
+        },
+      },
+      required: ["repo"],
+    },
+  },
 ]

-export function resolveToolCall(
+export async function resolveToolCall(
  name: string,
  args: Record<string, unknown>,
-  data: IndexedTraceData
-): string {
+  data: IndexedTraceData,
+): Promise<string> {
  switch (name) {
    case "get_original_code":
      return data.originalCode ?? "No original code available."
@ -344,6 +487,184 @@ export function resolveToolCall(
      return parts.join("\n\n")
    }

+    // get_llm_call_detail: load one LLM call row through Prisma and render its
+    // settings, prompts, responses and parsing outcome as plain text.
+    case "get_llm_call_detail": {
+      const callId = args.call_id as string
+      // Only IDs present in this trace's indexed data are looked up; anything else
+      // is answered with the list of IDs that are available.
+      if (!data.llmCalls.some((c) => c.id === callId)) {
+        return `LLM call ${callId} not found in this trace. Available IDs: ${data.llmCalls.map((c) => c.id).join(", ")}`
+      }
+      // Select only the columns that are rendered below.
+      const row = await prisma.llm_calls.findUnique({
+        where: { id: callId },
+        select: {
+          call_type: true,
+          model_name: true,
+          status: true,
+          system_prompt: true,
+          user_prompt: true,
+          messages: true,
+          raw_response: true,
+          parsed_response: true,
+          temperature: true,
+          n_candidates: true,
+          max_tokens: true,
+          prompt_tokens: true,
+          completion_tokens: true,
+          total_tokens: true,
+          llm_cost: true,
+          latency_ms: true,
+          parsing_status: true,
+          candidates_generated: true,
+          candidates_valid: true,
+          parsing_errors: true,
+          error_type: true,
+          error_message: true,
+        },
+      })
+      // The ID was in the indexed trace but the lookup returned no row.
+      if (!row) return `LLM call ${callId} not found in database.`
+
+      // Character cap applied (via truncate) to each prompt/message/response section.
+      const MAX_FIELD = 15_000
+      const parts: string[] = []
+      // Header: always-present fields first, optional settings only when set.
+      parts.push(`Call type: ${row.call_type}`)
+      parts.push(`Model: ${row.model_name}`)
+      parts.push(`Status: ${row.status}`)
+      if (row.temperature != null) parts.push(`Temperature: ${row.temperature}`)
+      if (row.n_candidates != null) parts.push(`N candidates requested: ${row.n_candidates}`)
+      if (row.max_tokens != null) parts.push(`Max tokens: ${row.max_tokens}`)
+      // Missing token counts are shown as "?" (p = prompt, c = completion, t = total).
+      parts.push(`Tokens: ${row.prompt_tokens ?? "?"}p / ${row.completion_tokens ?? "?"}c / ${row.total_tokens ?? "?"}t`)
+      if (row.llm_cost != null) parts.push(`Cost: $${row.llm_cost.toFixed(4)}`)
+      if (row.latency_ms != null) parts.push(`Latency: ${row.latency_ms}ms`)
+
+      // Each content section is emitted only when the field is truthy.
+      if (row.system_prompt) {
+        parts.push(`\n--- System Prompt ---\n${truncate(row.system_prompt, MAX_FIELD)}`)
+      }
+      if (row.user_prompt) {
+        parts.push(`\n--- User Prompt ---\n${truncate(row.user_prompt, MAX_FIELD)}`)
+      }
+      if (row.messages) {
+        // Serialized as pretty-printed JSON before truncation.
+        const messagesStr = JSON.stringify(row.messages, null, 2)
+        parts.push(`\n--- Messages ---\n${truncate(messagesStr, MAX_FIELD)}`)
+      }
+      if (row.raw_response) {
+        parts.push(`\n--- Raw Response ---\n${truncate(row.raw_response, MAX_FIELD)}`)
+      }
+      if (row.parsed_response) {
+        // Serialized as pretty-printed JSON before truncation.
+        const parsedStr = JSON.stringify(row.parsed_response, null, 2)
+        parts.push(`\n--- Parsed Response ---\n${truncate(parsedStr, MAX_FIELD)}`)
+      }
+
+      // Candidate counts and parsing errors are reported only when a parsing status exists.
+      if (row.parsing_status) {
+        parts.push(`\nParsing status: ${row.parsing_status}`)
+        if (row.candidates_generated != null) parts.push(`Candidates generated: ${row.candidates_generated}`)
+        if (row.candidates_valid != null) parts.push(`Candidates valid: ${row.candidates_valid}`)
+        if (row.parsing_errors) {
+          // Parsing errors use a smaller cap (5000 chars) than the content sections.
+          parts.push(`Parsing errors: ${truncate(JSON.stringify(row.parsing_errors, null, 2), 5000)}`)
+        }
+      }
+
+      if (row.error_type) parts.push(`\nError: [${row.error_type}] ${row.error_message ?? ""}`)
+
+      return parts.join("\n")
+    }
+
+    // read_file: return a line-numbered window of one file from a configured repo checkout.
+    // Every outcome, including failures, is reported as a string for the model to read.
+    case "read_file": {
+      const repoRoot = getRepoRoot(args.repo as string)
+      if (!repoRoot) return `Repository path not configured. Set CODEFLASH_INTERNAL_REPO_PATH or CODEFLASH_CLI_REPO_PATH env var.`
+
+      // Tool arguments arrive as untyped JSON from the model. Validate here because the
+      // path helpers throw on non-string input, and that happens outside the try below.
+      const requestedPath = args.path
+      if (typeof requestedPath !== "string" || requestedPath === "") {
+        return "read_file requires a non-empty string 'path'."
+      }
+      const pathResult = resolveAndValidatePath(repoRoot, requestedPath)
+      if ("error" in pathResult) return pathResult.error
+
+      // Accept numbers or numeric strings >= 1, rounded down; anything else means "use the default".
+      const toLineNumber = (value: unknown): number | null => {
+        if (typeof value !== "number" && typeof value !== "string") return null
+        const n = Math.floor(Number(value))
+        return Number.isFinite(n) && n >= 1 ? n : null
+      }
+
+      try {
+        const fileStat = await stat(pathResult.resolved)
+        if (!fileStat.isFile()) return `Not a file: ${requestedPath}`
+        if (fileStat.size > 1_000_000) return `File too large (${fileStat.size} bytes). Max 1MB.`
+
+        const content = await readFile(pathResult.resolved, "utf-8")
+        const allLines = content.split("\n")
+
+        const startLine = toLineNumber(args.start_line) ?? 1
+        // Say so explicitly instead of returning an empty body under a "lines 900-120" header.
+        if (startLine > allLines.length) {
+          return `start_line ${startLine} is past the end of ${requestedPath} (${allLines.length} lines).`
+        }
+        // Default window is 500 lines; never end before the start or past the last line.
+        const requestedEnd = toLineNumber(args.end_line) ?? startLine + 499
+        const endLine = Math.min(allLines.length, Math.max(startLine, requestedEnd))
+        const slice = allLines.slice(startLine - 1, endLine)
+
+        const numbered = slice.map((line, i) => `${startLine + i}: ${line}`).join("\n")
+        const header = `File: ${requestedPath} (lines ${startLine}-${endLine} of ${allLines.length})`
+        return `${header}\n\n${numbered}`
+      } catch (err) {
+        if ((err as NodeJS.ErrnoException).code === "ENOENT") return `File not found: ${requestedPath}`
+        return `Error reading file: ${(err as Error).message}`
+      }
+    }
+
+    // search_code: run ripgrep over one configured repo and return "path:line:text" rows
+    // with the repo root stripped from each path. Failures are reported as strings.
+    case "search_code": {
+      const repoRoot = getRepoRoot(args.repo as string)
+      if (!repoRoot) return `Repository path not configured. Set CODEFLASH_INTERNAL_REPO_PATH or CODEFLASH_CLI_REPO_PATH env var.`
+
+      // The pattern is model-controlled input; require a real string before building argv.
+      const pattern = args.pattern
+      if (typeof pattern !== "string" || pattern === "") {
+        return "search_code requires a non-empty string 'pattern'."
+      }
+
+      // Whole number in [1, 100]; non-numeric or zero falls back to 30.
+      const maxResults = Math.min(Math.max(1, Math.floor(Number(args.max_results)) || 30), 100)
+
+      // Resolve once so the path handed to rg and the prefix stripped from its output agree,
+      // even when the env var has a trailing slash or is relative.
+      const searchRoot = path.resolve(repoRoot)
+      const rgArgs = [
+        "--no-heading",
+        "--line-number",
+        "--color=never",
+        // rg applies --max-count per file; the overall cap is enforced on the output below.
+        "--max-count", String(maxResults),
+        "--glob", "!.git",
+        "--glob", "!node_modules",
+        "--glob", "!__pycache__",
+        "--glob", "!.next",
+        "--glob", "!dist",
+      ]
+      if (typeof args.glob === "string" && args.glob !== "") {
+        // The "--glob=<value>" form keeps a value that starts with "-" from being read as a flag.
+        rgArgs.push(`--glob=${args.glob}`)
+      }
+      // Pass the pattern through -e and end option parsing with "--" so a pattern such as
+      // "--pre=<cmd>" is treated as a regex and never as a ripgrep option.
+      rgArgs.push("-e", pattern, "--", searchRoot)
+
+      return new Promise<string>((resolve) => {
+        execFile("rg", rgArgs, { timeout: 10_000, maxBuffer: 1_000_000 }, (err, stdout, stderr) => {
+          if (err && !stdout) {
+            if ((err as NodeJS.ErrnoException).code === "ENOENT") {
+              resolve("ripgrep (rg) not found. Install it to use search_code.")
+              return
+            }
+            // A killed child with no output means the timeout fired; do not report "no matches".
+            if (err.killed) {
+              resolve("Search timed out after 10 seconds. Try a narrower pattern or glob.")
+              return
+            }
+            if (stderr) {
+              resolve(`Search error: ${stderr.slice(0, 500)}`)
+              return
+            }
+            // rg exits non-zero with empty stderr when nothing matched.
+            resolve("No matches found.")
+            return
+          }
+          // Strip the repo root only from the start of each row, never from matched text.
+          const prefix = searchRoot.endsWith(path.sep) ? searchRoot : searchRoot + path.sep
+          const rows = stdout
+            .split("\n")
+            .filter((row) => row.length > 0)
+            .map((row) => (row.startsWith(prefix) ? row.slice(prefix.length) : row))
+          if (rows.length === 0) {
+            resolve("No matches found.")
+            return
+          }
+          const shown = rows.slice(0, maxResults)
+          const omitted = rows.length - shown.length
+          const suffix = omitted > 0 ? `\n... [${omitted} more matching lines omitted]` : ""
+          resolve(shown.join("\n") + suffix)
+        })
+      })
+    }
+
+    // list_directory: list one directory of a configured repo, directories first
+    // (with a trailing "/"), each group sorted, capped at 200 entries. ".git" is hidden.
+    case "list_directory": {
+      const repoRoot = getRepoRoot(args.repo as string)
+      if (!repoRoot) return `Repository path not configured. Set CODEFLASH_INTERNAL_REPO_PATH or CODEFLASH_CLI_REPO_PATH env var.`
+
+      // 'path' is optional, but when present it must be a string: the path helpers throw
+      // on other types, and that would happen outside the try block below.
+      const rawPath = args.path
+      if (rawPath != null && typeof rawPath !== "string") {
+        return "list_directory 'path' must be a string."
+      }
+      // Missing or empty path means the repository root.
+      const relativePath = typeof rawPath === "string" && rawPath !== "" ? rawPath : "."
+      const pathResult = resolveAndValidatePath(repoRoot, relativePath)
+      if ("error" in pathResult) return pathResult.error
+
+      try {
+        const entries = await readdir(pathResult.resolved, { withFileTypes: true })
+        const dirs: string[] = []
+        const files: string[] = []
+        for (const entry of entries) {
+          if (entry.name === ".git") continue
+          if (entry.isDirectory()) dirs.push(entry.name + "/")
+          else files.push(entry.name)
+        }
+        dirs.sort()
+        files.sort()
+        const all = [...dirs, ...files]
+        if (all.length === 0) return `Empty directory: ${relativePath}`
+        // Keep the tool result bounded; report how many entries were left out.
+        const capped = all.slice(0, 200)
+        const suffix = all.length > 200 ? `\n... and ${all.length - 200} more entries` : ""
+        return `Directory: ${relativePath}\n\n${capped.join("\n")}${suffix}`
+      } catch (err) {
+        if ((err as NodeJS.ErrnoException).code === "ENOENT") return `Directory not found: ${relativePath}`
+        if ((err as NodeJS.ErrnoException).code === "ENOTDIR") return `Not a directory: ${relativePath}`
+        return `Error listing directory: ${(err as Error).message}`
+      }
+    }
+
    default:
      return `Unknown tool: ${name}`
  }