mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
* perf(analytics): use rfind and local json.loads for hot paths
Replace Path().suffix with string rfind for extension extraction,
use local json.loads binding and bytes split for JSONL parsing.
* fix: use splitlines and preserve extensionless file behavior
split("\n") mishandles \r\n line endings. The early return on
extensionless files changed behavior vs the original Path().suffix
which returned "" and fell through. Use splitlines() and let
extensionless files fall through with lang=None.
* style: use ternary for extensionless file check per SIM108
* Add blackbox benchmark VM infra
D2s_v5 (non-burstable, 2 vCPU, 8 GB) with cloud-init provisioning,
CPU-pinned benchmarks, and A/B comparison scripts.
---------
Co-authored-by: codeflash[bot] <codeflash[bot]@users.noreply.github.com>
256 lines
9.2 KiB
YAML
#cloud-config
#
# Benchmark VM provisioning for blackbox package (codeflash-agent monorepo)
#
# Pure Python package -- no system-level deps beyond build tools.
# Private repo: requires SSH agent forwarding for clone.
#
# Two-phase setup:
#   Phase 1 (cloud-init): packages, hyperfine, uv
#   Phase 2 (manual): ssh -A, clone, uv sync, baseline benchmarks
#
# Usage:
#   az vm create ... --custom-data infra/cloud-init.yaml
#   bash infra/vm-manage.sh ssh
#   bash ~/setup.sh
#
# VM: Azure Standard_D2s_v5 (2 vCPU, 8 GB RAM, general-purpose)
# Smallest non-burstable option -- blackbox is CPU-bound, not memory-bound.
# Use taskset -c 0 to pin benchmarks to 1 core for consistent results.
# Non-burstable ensures consistent CPU -- no credit depletion or turbo variability.
|
package_update: true
packages:
  - git
  # Compilers/headers in case uv sync needs to build any native wheel.
  - build-essential
  # Used by the uv installer in runcmd.
  - curl
|
write_files:
  # --- Benchmark: blackbox functions (main vs branch) ---
  - path: /home/azureuser/bench/bench_blackbox.sh
    owner: azureuser:azureuser
    permissions: "0755"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
#!/usr/bin/env bash
# Benchmark the blackbox package at a given git ref.
# Usage: bench_blackbox.sh [branch-or-ref]   (defaults to HEAD)
set -euo pipefail

# uv is installed into ~/.local/bin by its installer (see runcmd); export
# it here so the script also works from non-login shells. setup.sh does
# the same -- this script previously relied on the caller's PATH.
export PATH="$HOME/.local/bin:$PATH"

BRANCH="${1:-HEAD}"
# Relative to the repo checkout entered below.
PYTHON=.venv/bin/python

cd ~/codeflash-agent

echo "=== Benchmarking blackbox: $BRANCH ==="
git fetch origin
git checkout "$BRANCH"
uv sync

# Pin to core 0 so timings are stable on the 2-vCPU VM.
taskset -c 0 "$PYTHON" /home/azureuser/bench/bench_functions.py \
    ~/codeflash-agent
|
|
|
|
  # --- Benchmark: A/B comparison ---
  - path: /home/azureuser/bench/bench_ab.sh
    owner: azureuser:azureuser
    permissions: "0755"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
#!/usr/bin/env bash
# Run the blackbox benchmark for two branches back to back so their
# per-function timings can be compared side by side.
set -euo pipefail

BASE="${1:?Usage: bench_ab.sh <base-branch> <opt-branch>}"
OPT="${2:?Usage: bench_ab.sh <base-branch> <opt-branch>}"

# Print a labelled section header, then benchmark the given ref.
run_side() {
    echo ""
    echo "--- $1: $2 ---"
    bash ~/bench/bench_blackbox.sh "$2"
}

echo "=== A/B comparison: $BASE vs $OPT ==="
run_side BASELINE "$BASE"
run_side OPTIMIZED "$OPT"
|
|
|
|
  # --- Benchmark: Python script for per-function timing ---
  - path: /home/azureuser/bench/bench_functions.py
    owner: azureuser:azureuser
    # Not executable -- invoked via the venv interpreter by bench_blackbox.sh.
    permissions: "0644"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
"""Benchmark blackbox hot-path functions -- min-of-5 runs per function."""

from __future__ import annotations

import inspect
import json
import sys
import tempfile
import timeit
from collections import Counter
from pathlib import Path

# argv[1] is the repo root; put the blackbox package sources on sys.path
# so this script benchmarks the checkout directly without installing it.
sys.path.insert(0, str(Path(sys.argv[1]) / "packages/blackbox/src"))

from blackbox.models import (
    CODEFLASH_AGENT_PREFIXES,
    CODEFLASH_COMMANDS,
    CODEFLASH_SKILLS,
    LogEntry,
)
|
|
|
|
|
|
def _build_transcript(n_lines: int = 500) -> Path:
|
|
entries = []
|
|
for i in range(n_lines):
|
|
ts = f"2025-01-15T10:{i // 60:02d}:{i % 60:02d}Z"
|
|
if i % 3 == 0:
|
|
entries.append(json.dumps({
|
|
"type": "user", "timestamp": ts, "sessionId": "sess-bench",
|
|
"cwd": "/home/user/project", "gitBranch": "feature-x",
|
|
"message": {"content": f"User message {i}"},
|
|
}))
|
|
elif i % 3 == 1:
|
|
entries.append(json.dumps({
|
|
"type": "assistant", "timestamp": ts,
|
|
"message": {
|
|
"content": [
|
|
{"type": "text", "text": f"Step {i}."},
|
|
{"type": "tool_use", "id": f"tool_{i}", "name": "Write",
|
|
"input": {"file_path": f"/project/mod_{i % 10}.py",
|
|
"content": "def f():\n pass\n"}},
|
|
{"type": "tool_use", "id": f"tool_{i}b", "name": "Bash",
|
|
"input": {"command": f"git commit -m 'step {i}'"}},
|
|
],
|
|
"usage": {"input_tokens": 1000, "output_tokens": 200,
|
|
"cache_read_input_tokens": 50,
|
|
"cache_creation_input_tokens": 25},
|
|
},
|
|
}))
|
|
else:
|
|
entries.append(json.dumps({
|
|
"type": "user", "timestamp": ts,
|
|
"message": {"content": [
|
|
{"type": "tool_result", "tool_use_id": f"tool_{i - 1}",
|
|
"is_error": i % 15 == 0,
|
|
"content": "OK" if i % 15 != 0 else "Error: exit code 1"}
|
|
]},
|
|
}))
|
|
tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False, mode="w")
|
|
tmp.write("\n".join(entries))
|
|
tmp.close()
|
|
return Path(tmp.name)
|
|
|
|
|
|
def _build_log_entries(n: int = 200) -> list[LogEntry]:
    """Create *n* synthetic LogEntry rows, cycling through all log levels.

    tool_call entries get a tool payload in ``data``; assistant/tool_call
    rows come from source "claude", everything else from "user".
    """
    level_cycle = ["assistant", "tool_call", "tool_result", "status", "error", "info"]
    claude_levels = ("assistant", "tool_call")
    out: list[LogEntry] = []
    for i in range(n):
        level = level_cycle[i % len(level_cycle)]
        payload = (
            {"tool_name": "Write", "preview": "edit file.py"}
            if level == "tool_call"
            else {}
        )
        out.append(
            LogEntry(
                timestamp=1700000000.0 + i,
                source="claude" if level in claude_levels else "user",
                level=level,
                message=f"Sample message {i} with /path/to/some/file.py content",
                data=payload,
            )
        )
    return out
|
|
|
|
|
|
def best_of(fn, rounds: int = 5) -> float:
    """Call *fn* *rounds* times and return the smallest result.

    Minimum-of-N is the standard noise-robust aggregate for timing runs.
    """
    results = [fn() for _ in range(rounds)]
    return min(results)
|
|
|
|
|
|
def main() -> None:
    """Benchmark blackbox hot paths and print per-function best-of-5 timings.

    Expects sys.argv[1] to be the repo root (consumed at module import to
    extend sys.path). Prints one "<name> <ms>" line per benchmarked
    function plus a TOTAL line.
    """
    transcript = _build_transcript(500)
    # try/finally so the temp transcript is removed even when a benchmark
    # raises (previously it leaked on any mid-run failure).
    try:
        entries = _build_log_entries(200)

        from blackbox.analytics import extract_meta, track_file_changes
        from blackbox.dashboard.transcript import parse_transcript
        from blackbox.dashboard.rendering import render_log_html

        # track_file_changes grew a `languages` parameter on newer branches;
        # detect it so one script benchmarks both old and new signatures.
        sig = inspect.signature(track_file_changes)
        has_languages = "languages" in sig.parameters

        tool_inputs = [
            ("Write", {"file_path": "/project/src/main.py", "content": "x = 1\ny = 2\n"}),
            ("Edit", {"file_path": "/project/src/utils.py", "old_string": "a", "new_string": "b"}),
            ("Write", {"file_path": "/project/README.md", "content": "hello"}),
            ("Write", {"file_path": "/project/Makefile", "content": "all:"}),
            ("Write", {"file_path": "/project/src/app.tsx", "content": "export default () => {}"}),
        ]
        test_vals = [
            "codeflash-python", "codeflash", "random-agent", "codeflash-review",
            "/optimize", "/status", "unknown-cmd", "/benchmark",
            "other-prefix", "codeflash-researcher",
        ]

        # Warmup: absorb first-call overhead (imports, caches) before timing.
        extract_meta(transcript)
        parse_transcript(transcript)
        for e in entries:
            render_log_html(e)

        t1 = best_of(lambda: timeit.timeit(
            lambda: extract_meta(transcript), number=200) / 200)

        def _track():
            # Fresh accumulators per run so state growth doesn't skew timing.
            f: set[str] = set()
            langs: Counter[str] = Counter()
            for tn, ti in tool_inputs:
                if has_languages:
                    track_file_changes(tn, ti, f, langs)
                else:
                    track_file_changes(tn, ti, f)
        t2 = best_of(lambda: timeit.timeit(_track, number=10000) / 10000)

        t3 = best_of(lambda: timeit.timeit(
            lambda: parse_transcript(transcript), number=200) / 200)

        def _render():
            for e in entries:
                render_log_html(e)
        t4 = best_of(lambda: timeit.timeit(_render, number=1000) / 1000)

        def _member():
            # Membership tests against the module-level constant collections.
            for v in test_vals:
                _ = v in CODEFLASH_AGENT_PREFIXES
                _ = v in CODEFLASH_SKILLS
                _ = v in CODEFLASH_COMMANDS
        t5 = best_of(lambda: timeit.timeit(_member, number=100000) / 100000)

        print(f"extract_meta {t1*1000:.4f} ms")
        print(f"track_file_changes {t2*1000:.4f} ms")
        print(f"parse_transcript {t3*1000:.4f} ms")
        print(f"render_log_html {t4*1000:.4f} ms")
        print(f"membership {t5*1000:.6f} ms")
        print(f"TOTAL {(t1+t2+t3+t4+t5)*1000:.4f} ms")
    finally:
        transcript.unlink(missing_ok=True)


if __name__ == "__main__":
    main()
|
|
|
|
  # --- Post-provision setup (run manually after ssh -A) ---
  - path: /home/azureuser/setup.sh
    owner: azureuser:azureuser
    permissions: "0755"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
#!/usr/bin/env bash
# Phase-2 setup: clone the private repo (requires ssh -A for agent
# forwarding), install deps with uv, and sanity-check the install.
set -euo pipefail
export PATH="$HOME/.local/bin:$PATH"

# Print a section banner.
step() { echo "=== $1 ==="; }

step "Cloning codeflash-agent"
git clone git@github.com:codeflash-ai/codeflash-agent.git ~/codeflash-agent
cd ~/codeflash-agent

step "Installing dependencies"
uv sync

step "Creating results directory"
mkdir -p ~/results

step "Verifying installation"
.venv/bin/python -c 'from blackbox.models import LogEntry; print("OK")'

step "Done"
|
|
|
|
runcmd:
  # hyperfine: CLI benchmarking tool, installed from a pinned release .deb.
  - wget -q https://github.com/sharkdp/hyperfine/releases/download/v1.19.0/hyperfine_1.19.0_amd64.deb -O /tmp/hyperfine.deb
  - dpkg -i /tmp/hyperfine.deb
  # runcmd runs as root; install uv per-user for azureuser via su.
  - su - azureuser -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'
|