mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
Add blackbox benchmark VM infra
D2s_v5 (non-burstable, 2 vCPU, 8 GB) with cloud-init provisioning, CPU-pinned benchmarks, and A/B comparison scripts.
This commit is contained in:
parent
f9f25538d2
commit
b46101612d
2 changed files with 370 additions and 0 deletions
256
.codeflash/krrt7/codeflash-ai/blackbox/infra/cloud-init.yaml
Normal file
256
.codeflash/krrt7/codeflash-ai/blackbox/infra/cloud-init.yaml
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
#cloud-config
|
||||
#
|
||||
# Benchmark VM provisioning for blackbox package (codeflash-agent monorepo)
|
||||
#
|
||||
# Pure Python package -- no system-level deps beyond build tools.
|
||||
# Private repo: requires SSH agent forwarding for clone.
|
||||
#
|
||||
# Two-phase setup:
|
||||
# Phase 1 (cloud-init): packages, hyperfine, uv
|
||||
# Phase 2 (manual): ssh -A, clone, uv sync, baseline benchmarks
|
||||
#
|
||||
# Usage:
|
||||
# az vm create ... --custom-data infra/cloud-init.yaml
|
||||
# bash infra/vm-manage.sh ssh
|
||||
# bash ~/setup.sh
|
||||
#
|
||||
# VM: Azure Standard_D2s_v5 (2 vCPU, 8 GB RAM, general-purpose)
|
||||
# Smallest non-burstable option -- blackbox is CPU-bound, not memory-bound.
|
||||
# Use taskset -c 0 to pin benchmarks to 1 core for consistent results.
|
||||
# Non-burstable ensures consistent CPU -- no credit depletion or turbo variability.
|
||||
|
||||
package_update: true
|
||||
packages:
|
||||
- git
|
||||
- build-essential
|
||||
- curl
|
||||
|
||||
write_files:
|
||||
# --- Benchmark: blackbox functions (main vs branch) ---
|
||||
- path: /home/azureuser/bench/bench_blackbox.sh
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0755"
|
||||
defer: true
|
||||
content: |
|
||||
#!/usr/bin/env bash
# Benchmark the blackbox package at a given git ref (defaults to HEAD).
# Checks out the ref, syncs dependencies, then runs the Python benchmark
# driver pinned to a single CPU core for stable timings.
set -euo pipefail

target="${1:-HEAD}"
py=.venv/bin/python

cd ~/codeflash-agent

echo "=== Benchmarking blackbox: $target ==="
git fetch origin
git checkout "$target"
uv sync

# taskset -c 0: pin to core 0 so results aren't skewed by core migration.
taskset -c 0 "$py" /home/azureuser/bench/bench_functions.py ~/codeflash-agent
|
||||
|
||||
# --- Benchmark: A/B comparison ---
|
||||
- path: /home/azureuser/bench/bench_ab.sh
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0755"
|
||||
defer: true
|
||||
content: |
|
||||
#!/usr/bin/env bash
# A/B benchmark driver: run the blackbox benchmark first on a baseline
# branch, then on an optimized branch, so the two reports can be compared.
set -euo pipefail

base_ref="${1:?Usage: bench_ab.sh <base-branch> <opt-branch>}"
opt_ref="${2:?Usage: bench_ab.sh <base-branch> <opt-branch>}"

echo "=== A/B comparison: $base_ref vs $opt_ref ==="

# Baseline first, then optimized — both on the same pinned core
# (see bench_blackbox.sh).
echo ""
echo "--- BASELINE: $base_ref ---"
bash ~/bench/bench_blackbox.sh "$base_ref"

echo ""
echo "--- OPTIMIZED: $opt_ref ---"
bash ~/bench/bench_blackbox.sh "$opt_ref"
|
||||
|
||||
# --- Benchmark: Python script for per-function timing ---
|
||||
- path: /home/azureuser/bench/bench_functions.py
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0644"
|
||||
defer: true
|
||||
content: |
|
||||
"""Benchmark blackbox hot-path functions -- min-of-5 runs per function."""
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import timeit
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(sys.argv[1]) / "packages/blackbox/src"))
|
||||
|
||||
from blackbox.models import (
|
||||
CODEFLASH_AGENT_PREFIXES,
|
||||
CODEFLASH_COMMANDS,
|
||||
CODEFLASH_SKILLS,
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
|
||||
def _build_transcript(n_lines: int = 500) -> Path:
|
||||
entries = []
|
||||
for i in range(n_lines):
|
||||
ts = f"2025-01-15T10:{i // 60:02d}:{i % 60:02d}Z"
|
||||
if i % 3 == 0:
|
||||
entries.append(json.dumps({
|
||||
"type": "user", "timestamp": ts, "sessionId": "sess-bench",
|
||||
"cwd": "/home/user/project", "gitBranch": "feature-x",
|
||||
"message": {"content": f"User message {i}"},
|
||||
}))
|
||||
elif i % 3 == 1:
|
||||
entries.append(json.dumps({
|
||||
"type": "assistant", "timestamp": ts,
|
||||
"message": {
|
||||
"content": [
|
||||
{"type": "text", "text": f"Step {i}."},
|
||||
{"type": "tool_use", "id": f"tool_{i}", "name": "Write",
|
||||
"input": {"file_path": f"/project/mod_{i % 10}.py",
|
||||
"content": "def f():\n pass\n"}},
|
||||
{"type": "tool_use", "id": f"tool_{i}b", "name": "Bash",
|
||||
"input": {"command": f"git commit -m 'step {i}'"}},
|
||||
],
|
||||
"usage": {"input_tokens": 1000, "output_tokens": 200,
|
||||
"cache_read_input_tokens": 50,
|
||||
"cache_creation_input_tokens": 25},
|
||||
},
|
||||
}))
|
||||
else:
|
||||
entries.append(json.dumps({
|
||||
"type": "user", "timestamp": ts,
|
||||
"message": {"content": [
|
||||
{"type": "tool_result", "tool_use_id": f"tool_{i - 1}",
|
||||
"is_error": i % 15 == 0,
|
||||
"content": "OK" if i % 15 != 0 else "Error: exit code 1"}
|
||||
]},
|
||||
}))
|
||||
tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False, mode="w")
|
||||
tmp.write("\n".join(entries))
|
||||
tmp.close()
|
||||
return Path(tmp.name)
|
||||
|
||||
|
||||
def _build_log_entries(n: int = 200) -> list[LogEntry]:
    """Build *n* synthetic LogEntry objects cycling through all log levels.

    Only ``tool_call`` entries carry a non-empty ``data`` payload, matching
    what the renderer under benchmark expects to see.
    """
    level_cycle = ("assistant", "tool_call", "tool_result", "status", "error", "info")
    built: list[LogEntry] = []
    for idx in range(n):
        level = level_cycle[idx % len(level_cycle)]
        # Claude-originated levels get a "claude" source; everything else "user".
        origin = "claude" if level in ("assistant", "tool_call") else "user"
        payload = (
            {"tool_name": "Write", "preview": "edit file.py"}
            if level == "tool_call"
            else {}
        )
        built.append(LogEntry(
            timestamp=1700000000.0 + idx,
            source=origin,
            level=level,
            message=f"Sample message {idx} with /path/to/some/file.py content",
            data=payload,
        ))
    return built
|
||||
|
||||
|
||||
def best_of(fn, rounds: int = 5) -> float:
    """Call *fn* *rounds* times and return the smallest result (best-of-N)."""
    samples = [fn() for _ in range(rounds)]
    return min(samples)
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark blackbox hot-path functions and print per-function timings.

    Builds synthetic fixtures, warms the code paths once, then times each
    target with best-of-5 ``timeit`` runs and prints per-call milliseconds.
    """
    transcript = _build_transcript(500)
    entries = _build_log_entries(200)

    # Imported here (after sys.path setup at module level) so the checked-out
    # branch's code is what gets measured.
    from blackbox.analytics import extract_meta, track_file_changes
    from blackbox.dashboard.transcript import parse_transcript
    from blackbox.dashboard.rendering import render_log_html

    # Some revisions of track_file_changes lack the `languages` parameter;
    # probe the signature so both sides of an A/B run can be benchmarked.
    has_languages = "languages" in inspect.signature(track_file_changes).parameters

    tool_inputs = [
        ("Write", {"file_path": "/project/src/main.py", "content": "x = 1\ny = 2\n"}),
        ("Edit", {"file_path": "/project/src/utils.py", "old_string": "a", "new_string": "b"}),
        ("Write", {"file_path": "/project/README.md", "content": "hello"}),
        ("Write", {"file_path": "/project/Makefile", "content": "all:"}),
        ("Write", {"file_path": "/project/src/app.tsx", "content": "export default () => {}"}),
    ]
    test_vals = [
        "codeflash-python", "codeflash", "random-agent", "codeflash-review",
        "/optimize", "/status", "unknown-cmd", "/benchmark",
        "other-prefix", "codeflash-researcher",
    ]

    # Warmup: populate any caches so the timed runs measure steady state.
    extract_meta(transcript)
    parse_transcript(transcript)
    for entry in entries:
        render_log_html(entry)

    t1 = best_of(
        lambda: timeit.timeit(lambda: extract_meta(transcript), number=200) / 200
    )

    def run_track():
        # Fresh accumulators each run so per-call work stays constant.
        changed: set[str] = set()
        lang_counts: Counter[str] = Counter()
        for tool_name, tool_input in tool_inputs:
            if has_languages:
                track_file_changes(tool_name, tool_input, changed, lang_counts)
            else:
                track_file_changes(tool_name, tool_input, changed)

    t2 = best_of(lambda: timeit.timeit(run_track, number=10000) / 10000)

    t3 = best_of(
        lambda: timeit.timeit(lambda: parse_transcript(transcript), number=200) / 200
    )

    def run_render():
        for entry in entries:
            render_log_html(entry)

    t4 = best_of(lambda: timeit.timeit(run_render, number=1000) / 1000)

    def run_membership():
        for candidate in test_vals:
            _ = candidate in CODEFLASH_AGENT_PREFIXES
            _ = candidate in CODEFLASH_SKILLS
            _ = candidate in CODEFLASH_COMMANDS

    t5 = best_of(lambda: timeit.timeit(run_membership, number=100000) / 100000)

    print(f"extract_meta {t1*1000:.4f} ms")
    print(f"track_file_changes {t2*1000:.4f} ms")
    print(f"parse_transcript {t3*1000:.4f} ms")
    print(f"render_log_html {t4*1000:.4f} ms")
    print(f"membership {t5*1000:.6f} ms")
    print(f"TOTAL {(t1+t2+t3+t4+t5)*1000:.4f} ms")

    # Clean up the temp fixture created by _build_transcript.
    transcript.unlink(missing_ok=True)
|
||||
|
||||
|
||||
# Script entry point: run the full benchmark suite once and exit.
if __name__ == "__main__":
    main()
|
||||
|
||||
# --- Post-provision setup (run manually after ssh -A) ---
|
||||
- path: /home/azureuser/setup.sh
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0755"
|
||||
defer: true
|
||||
content: |
|
||||
#!/usr/bin/env bash
# Post-provision setup: clone the private repo, install dependencies, and
# verify the install. Run manually over `ssh -A` so the forwarded SSH agent
# can authenticate the git clone.
set -euo pipefail

# uv's installer (run by cloud-init) drops the binary into ~/.local/bin.
export PATH="$HOME/.local/bin:$PATH"

# Small helper so every phase is announced the same way.
step() { echo "=== $1 ==="; }

step "Cloning codeflash-agent"
git clone git@github.com:codeflash-ai/codeflash-agent.git ~/codeflash-agent
cd ~/codeflash-agent

step "Installing dependencies"
uv sync

step "Creating results directory"
mkdir -p ~/results

step "Verifying installation"
.venv/bin/python -c 'from blackbox.models import LogEntry; print("OK")'

step "Done"
||||
|
||||
runcmd:
|
||||
- wget -q https://github.com/sharkdp/hyperfine/releases/download/v1.19.0/hyperfine_1.19.0_amd64.deb -O /tmp/hyperfine.deb
|
||||
- dpkg -i /tmp/hyperfine.deb
|
||||
- su - azureuser -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'
|
||||
114
.codeflash/krrt7/codeflash-ai/blackbox/infra/vm-manage.sh
Normal file
114
.codeflash/krrt7/codeflash-ai/blackbox/infra/vm-manage.sh
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
#!/usr/bin/env bash
#
# Azure benchmark VM lifecycle management for blackbox package
#
# Usage:
#   bash infra/vm-manage.sh {create|start|stop|ip|ssh|bench <branch>|ab <base> <opt>|destroy}

set -euo pipefail

RG="blackbox-BENCH-RG"
VM="blackbox-bench"
REGION="westus2"
SIZE="Standard_D2s_v5"
IMAGE="Canonical:ubuntu-24_04-lts:server:latest"
SSH_KEY="${SSH_KEY:-$HOME/.ssh/id_ed25519.pub}"

# Resolve the VM's current public IP (shared by start/ip/ssh/bench/ab).
vm_ip() {
    az vm show -g "$RG" -n "$VM" -d --query publicIps -o tsv
}

case "${1:-help}" in
    create)
        # Refuse to create a VM without a public key to install.
        if [ ! -f "$SSH_KEY" ]; then
            echo "Error: SSH public key not found at $SSH_KEY"
            echo "Generate one: ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519"
            echo "Or set SSH_KEY=/path/to/key.pub"
            exit 1
        fi

        echo "Creating resource group..."
        az group create --name "$RG" --location "$REGION" --only-show-errors --output none

        # --nsg-rule NONE: start fully closed; SSH is opened below for the
        # caller's IP only.
        echo "Creating VM (Trusted Launch, SSH-only, locked-down NSG)..."
        az vm create \
            --resource-group "$RG" \
            --name "$VM" \
            --image "$IMAGE" \
            --size "$SIZE" \
            --os-disk-size-gb 32 \
            --admin-username azureuser \
            --ssh-key-values "$SSH_KEY" \
            --authentication-type ssh \
            --security-type TrustedLaunch \
            --enable-secure-boot true \
            --enable-vtpm true \
            --nsg-rule NONE \
            --custom-data infra/cloud-init.yaml \
            --only-show-errors

        # Allow SSH from the machine running this script, and nothing else.
        caller_ip=$(curl -s ifconfig.me)
        echo "Restricting SSH to $caller_ip..."
        az network nsg rule create \
            --resource-group "$RG" \
            --nsg-name "${VM}NSG" \
            --name AllowSSHFromMyIP \
            --priority 1000 \
            --source-address-prefixes "$caller_ip/32" \
            --destination-port-ranges 22 \
            --access Allow \
            --protocol Tcp \
            --output none

        echo "VM created. Get IP with: $0 ip"
        ;;

    start)
        echo "Starting VM..."
        az vm start --resource-group "$RG" --name "$VM"
        echo "Started. IP: $(vm_ip)"
        ;;

    stop)
        # Deallocate (not just power off) so compute billing stops.
        echo "Deallocating VM (stops billing)..."
        az vm deallocate --resource-group "$RG" --name "$VM"
        echo "Deallocated."
        ;;

    ip)
        vm_ip
        ;;

    ssh)
        # -A forwards the SSH agent so the private repo can be cloned on-VM.
        ssh -A azureuser@"$(vm_ip)" "${@:2}"
        ;;

    bench)
        ref="${2:?Usage: $0 bench <branch>}"
        ssh -A azureuser@"$(vm_ip)" "bash ~/bench/bench_blackbox.sh $ref"
        ;;

    ab)
        base_ref="${2:?Usage: $0 ab <base-branch> <opt-branch>}"
        opt_ref="${3:?Usage: $0 ab <base-branch> <opt-branch>}"
        ssh -A azureuser@"$(vm_ip)" "bash ~/bench/bench_ab.sh $base_ref $opt_ref"
        ;;

    destroy)
        # --no-wait: deletion proceeds in the background.
        echo "Destroying resource group (all resources)..."
        az group delete --name "$RG" --yes --no-wait
        echo "Deletion started."
        ;;

    help|*)
        echo "Usage: $0 {create|start|stop|ip|ssh|bench <branch>|ab <base> <opt>|destroy}"
        echo ""
        echo "  create  - Provision VM with cloud-init"
        echo "  start   - Start deallocated VM"
        echo "  stop    - Deallocate VM (stops billing)"
        echo "  ip      - Show VM public IP"
        echo "  ssh     - SSH into VM with agent forwarding"
        echo "  bench   - Run benchmarks on a branch"
        echo "  ab      - A/B comparison between two branches"
        echo "  destroy - Delete resource group and all resources"
        ;;
esac
|
||||
Loading…
Reference in a new issue