Add blackbox benchmark VM infra

D2s_v5 (non-burstable, 2 vCPU, 8 GB) with cloud-init provisioning,
CPU-pinned benchmarks, and A/B comparison scripts.
This commit is contained in:
Kevin Turcios 2026-04-29 03:04:59 -05:00
parent 5f3bb4dba8
commit 1da8b41b66
2 changed files with 370 additions and 0 deletions

View file

@ -0,0 +1,256 @@
#cloud-config
#
# Benchmark VM provisioning for blackbox package (codeflash-agent monorepo)
#
# Pure Python package -- no system-level deps beyond build tools.
# Private repo: requires SSH agent forwarding for clone.
#
# Two-phase setup:
# Phase 1 (cloud-init): packages, hyperfine, uv
# Phase 2 (manual): ssh -A, clone, uv sync, baseline benchmarks
#
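# Usage:
# bash infra/vm-manage.sh create   (wraps: az vm create ... --custom-data infra/cloud-init.yaml)
# bash infra/vm-manage.sh ssh      (connects with SSH agent forwarding)
# bash ~/setup.sh                  (run on the VM once connected)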
#
# VM: Azure Standard_D2s_v5 (2 vCPU, 8 GB RAM, general-purpose)
# Smallest non-burstable option -- blackbox is CPU-bound, not memory-bound.
# Use taskset -c 0 to pin benchmarks to 1 core for consistent results.
# Non-burstable sizes have no CPU-credit model, so there is no credit depletion
# or post-burst throttling to skew timings between runs.
package_update: true
packages:
- git
- build-essential
- curl
write_files:
# --- Benchmark: blackbox functions (main vs branch) ---
- path: /home/azureuser/bench/bench_blackbox.sh
owner: azureuser:azureuser
permissions: "0755"
defer: true
content: |
#!/usr/bin/env bash
# Benchmark the blackbox package at a given git ref, pinned to CPU core 0.
#
# Usage: bench_blackbox.sh [ref]   (ref defaults to HEAD)
set -euo pipefail

# setup.sh puts ~/.local/bin on PATH before calling uv; do the same here so
# `uv sync` is found when this script is started from a non-interactive SSH
# command, which may not load the user's shell profile.
export PATH="$HOME/.local/bin:$PATH"

BRANCH="${1:-HEAD}"
PYTHON=.venv/bin/python  # relative to the repo root entered below

cd ~/codeflash-agent
echo "=== Benchmarking blackbox: $BRANCH ==="
git fetch origin
git checkout "$BRANCH"

# `git fetch` does not advance a local branch that already exists, so a second
# run on the same branch would measure stale code. Fast-forward to the upstream
# when there is one; detached HEADs and the default "HEAD" ref are left alone.
if [ "$BRANCH" != "HEAD" ] \
  && git rev-parse --verify --quiet '@{upstream}' >/dev/null 2>&1; then
  git merge --ff-only '@{upstream}'
fi

uv sync
taskset -c 0 "$PYTHON" /home/azureuser/bench/bench_functions.py \
  ~/codeflash-agent
# --- Benchmark: A/B comparison ---
- path: /home/azureuser/bench/bench_ab.sh
owner: azureuser:azureuser
permissions: "0755"
defer: true
content: |
#!/usr/bin/env bash
# Run bench_blackbox.sh on a baseline ref and then on an optimized ref,
# printing a labelled section for each.
set -euo pipefail

usage="Usage: bench_ab.sh <base-branch> <opt-branch>"
BASE="${1:?$usage}"
OPT="${2:?$usage}"

# Print a blank line and a section header, then benchmark the given ref.
#   $1 = section label, $2 = git ref
run_side() {
  echo ""
  echo "--- $1: $2 ---"
  bash ~/bench/bench_blackbox.sh "$2"
}

echo "=== A/B comparison: $BASE vs $OPT ==="
run_side BASELINE "$BASE"
run_side OPTIMIZED "$OPT"
# --- Benchmark: Python script for per-function timing ---
- path: /home/azureuser/bench/bench_functions.py
owner: azureuser:azureuser
permissions: "0644"
defer: true
content: |
"""Benchmark blackbox hot-path functions -- min-of-5 runs per function."""
from __future__ import annotations
import inspect
import json
import sys
import tempfile
import timeit
from collections import Counter
from pathlib import Path
sys.path.insert(0, str(Path(sys.argv[1]) / "packages/blackbox/src"))
from blackbox.models import (
CODEFLASH_AGENT_PREFIXES,
CODEFLASH_COMMANDS,
CODEFLASH_SKILLS,
LogEntry,
)
def _build_transcript(n_lines: int = 500) -> Path:
    """Write a synthetic JSONL transcript and return the path to it.

    Rows cycle through three shapes: a user message, an assistant message
    carrying two ``tool_use`` items plus token usage, and a user
    ``tool_result``. Every fifth tool result is marked as an error.

    Args:
        n_lines: Number of JSONL rows to generate.

    Returns:
        Path of the temporary ``.jsonl`` file. The file is created with
        ``delete=False``; the caller is responsible for removing it.
    """
    # Local import: the surrounding script does not import datetime.
    from datetime import datetime, timedelta

    # One row per second from 10:00:00. Real datetime arithmetic keeps the
    # stamps valid past 3600 rows, where a minutes-only format would overflow.
    start = datetime(2025, 1, 15, 10, 0, 0)

    entries = []
    for i in range(n_lines):
        ts = (start + timedelta(seconds=i)).strftime("%Y-%m-%dT%H:%M:%SZ")
        shape = i % 3
        if shape == 0:
            entries.append(json.dumps({
                "type": "user", "timestamp": ts, "sessionId": "sess-bench",
                "cwd": "/home/user/project", "gitBranch": "feature-x",
                "message": {"content": f"User message {i}"},
            }))
        elif shape == 1:
            entries.append(json.dumps({
                "type": "assistant", "timestamp": ts,
                "message": {
                    "content": [
                        {"type": "text", "text": f"Step {i}."},
                        {"type": "tool_use", "id": f"tool_{i}", "name": "Write",
                         "input": {"file_path": f"/project/mod_{i % 10}.py",
                                   "content": "def f():\n pass\n"}},
                        {"type": "tool_use", "id": f"tool_{i}b", "name": "Bash",
                         "input": {"command": f"git commit -m 'step {i}'"}},
                    ],
                    "usage": {"input_tokens": 1000, "output_tokens": 200,
                              "cache_read_input_tokens": 50,
                              "cache_creation_input_tokens": 25},
                },
            }))
        else:
            # Rows in this branch always have i % 3 == 2, so a test against
            # i % 15 == 0 can never fire here. i % 15 == 14 selects every
            # fifth tool result, so the error path is actually generated.
            failed = i % 15 == 14
            entries.append(json.dumps({
                "type": "user", "timestamp": ts,
                "message": {"content": [
                    {"type": "tool_result", "tool_use_id": f"tool_{i - 1}",
                     "is_error": failed,
                     "content": "Error: exit code 1" if failed else "OK"}
                ]},
            }))

    # The context manager closes the handle even if the write raises.
    with tempfile.NamedTemporaryFile(
        suffix=".jsonl", delete=False, mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write("\n".join(entries))
    return Path(tmp.name)
def _build_log_entries(n: int = 200) -> list[LogEntry]:
    """Return ``n`` synthetic ``LogEntry`` objects for the rendering benchmark.

    Levels cycle through six fixed values. Entries at the "assistant" and
    "tool_call" levels get source "claude", all others "user". Only
    "tool_call" entries carry a non-empty ``data`` payload.
    """
    kinds = ("assistant", "tool_call", "tool_result", "status", "error", "info")
    claude_kinds = ("assistant", "tool_call")

    def make(idx: int) -> LogEntry:
        # Build the entry for position idx in the cycle.
        kind = kinds[idx % len(kinds)]
        if kind == "tool_call":
            payload = {"tool_name": "Write", "preview": "edit file.py"}
        else:
            payload = {}
        return LogEntry(
            timestamp=1700000000.0 + idx,
            source="claude" if kind in claude_kinds else "user",
            level=kind,
            message=f"Sample message {idx} with /path/to/some/file.py content",
            data=payload,
        )

    return [make(idx) for idx in range(n)]
def best_of(fn, rounds: int = 5) -> float:
    """Call ``fn`` ``rounds`` times and return the smallest value it produced.

    Taking the minimum of repeated timings discards runs that were slowed
    down by unrelated activity on the machine.
    """
    samples = [fn() for _ in range(rounds)]
    return min(samples)
def main() -> None:
    """Time the blackbox hot paths and print per-call results in milliseconds.

    Each figure is the best of several ``timeit`` batches (see ``best_of``),
    divided by the batch size to give seconds per call. The synthetic
    transcript file is removed on exit, including when a benchmark raises.
    """
    # Imported here rather than at module top: they resolve through the
    # sys.path entry that the script inserts from sys.argv[1].
    from blackbox.analytics import extract_meta, track_file_changes
    from blackbox.dashboard.transcript import parse_transcript
    from blackbox.dashboard.rendering import render_log_html

    # Branches under comparison may or may not accept a `languages` counter;
    # detect the signature so the same script runs against both.
    sig = inspect.signature(track_file_changes)
    has_languages = "languages" in sig.parameters

    tool_inputs = [
        ("Write", {"file_path": "/project/src/main.py", "content": "x = 1\ny = 2\n"}),
        ("Edit", {"file_path": "/project/src/utils.py", "old_string": "a", "new_string": "b"}),
        ("Write", {"file_path": "/project/README.md", "content": "hello"}),
        ("Write", {"file_path": "/project/Makefile", "content": "all:"}),
        ("Write", {"file_path": "/project/src/app.tsx", "content": "export default () => {}"}),
    ]
    test_vals = [
        "codeflash-python", "codeflash", "random-agent", "codeflash-review",
        "/optimize", "/status", "unknown-cmd", "/benchmark",
        "other-prefix", "codeflash-researcher",
    ]

    def per_call(fn, number: int) -> float:
        # Best-of-N seconds per single call of fn, using batches of `number`.
        return best_of(lambda: timeit.timeit(fn, number=number) / number)

    entries = _build_log_entries(200)
    transcript = _build_transcript(500)
    try:
        # Warmup
        extract_meta(transcript)
        parse_transcript(transcript)
        for e in entries:
            render_log_html(e)

        t1 = per_call(lambda: extract_meta(transcript), 200)

        def _track():
            f: set[str] = set()
            langs: Counter[str] = Counter()
            for tn, ti in tool_inputs:
                if has_languages:
                    track_file_changes(tn, ti, f, langs)
                else:
                    track_file_changes(tn, ti, f)

        t2 = per_call(_track, 10000)
        t3 = per_call(lambda: parse_transcript(transcript), 200)

        def _render():
            for e in entries:
                render_log_html(e)

        t4 = per_call(_render, 1000)

        def _member():
            for v in test_vals:
                _ = v in CODEFLASH_AGENT_PREFIXES
                _ = v in CODEFLASH_SKILLS
                _ = v in CODEFLASH_COMMANDS

        t5 = per_call(_member, 100000)

        print(f"extract_meta {t1*1000:.4f} ms")
        print(f"track_file_changes {t2*1000:.4f} ms")
        print(f"parse_transcript {t3*1000:.4f} ms")
        print(f"render_log_html {t4*1000:.4f} ms")
        print(f"membership {t5*1000:.6f} ms")
        print(f"TOTAL {(t1+t2+t3+t4+t5)*1000:.4f} ms")
    finally:
        # Always remove the temp transcript, even if a benchmark failed.
        transcript.unlink(missing_ok=True)


if __name__ == "__main__":
    main()
# --- Post-provision setup (run manually after ssh -A) ---
- path: /home/azureuser/setup.sh
owner: azureuser:azureuser
permissions: "0755"
defer: true
content: |
#!/usr/bin/env bash
# Post-provision setup: fetch the private repo, install dependencies, and
# verify that the blackbox package imports. Run manually after `ssh -A`,
# since the clone goes over SSH to GitHub.
set -euo pipefail
export PATH="$HOME/.local/bin:$PATH"

REPO_DIR="$HOME/codeflash-agent"

# `git clone` aborts when the destination already exists, so re-running this
# script after a partial failure would stop here. Reuse an existing checkout.
if [ -d "$REPO_DIR/.git" ]; then
  echo "=== Updating existing codeflash-agent checkout ==="
  git -C "$REPO_DIR" fetch origin
else
  echo "=== Cloning codeflash-agent ==="
  git clone git@github.com:codeflash-ai/codeflash-agent.git "$REPO_DIR"
fi
cd "$REPO_DIR"

echo "=== Installing dependencies ==="
uv sync

echo "=== Creating results directory ==="
mkdir -p ~/results

echo "=== Verifying installation ==="
.venv/bin/python -c 'from blackbox.models import LogEntry; print("OK")'

echo "=== Done ==="
# Phase 1 provisioning commands (the header lists them as: hyperfine, uv).
runcmd:
# Download the pinned hyperfine 1.19.0 amd64 .deb from its GitHub release.
- wget -q https://github.com/sharkdp/hyperfine/releases/download/v1.19.0/hyperfine_1.19.0_amd64.deb -O /tmp/hyperfine.deb
# Install the downloaded package.
- dpkg -i /tmp/hyperfine.deb
# Run the uv installer as azureuser; setup.sh later looks for uv via ~/.local/bin.
- su - azureuser -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'

View file

@ -0,0 +1,114 @@
#!/usr/bin/env bash
#
# Azure benchmark VM lifecycle management for blackbox package
#
# Usage:
#   bash infra/vm-manage.sh {create|start|stop|ip|ssh|bench <branch>|ab <base> <opt>|destroy}
#
# Environment:
#   SSH_KEY  Path to the SSH public key installed on the VM
#            (default: ~/.ssh/id_ed25519.pub)
set -euo pipefail

RG="blackbox-BENCH-RG"
VM="blackbox-bench"
REGION="westus2"
SIZE="Standard_D2s_v5"
IMAGE="Canonical:ubuntu-24_04-lts:server:latest"
SSH_KEY="${SSH_KEY:-$HOME/.ssh/id_ed25519.pub}"

# Locate cloud-init.yaml next to this script so `create` works from any
# working directory, not only from the repository root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
CLOUD_INIT="$SCRIPT_DIR/cloud-init.yaml"

# Print the VM's public IP. Returns non-zero with a message when Azure reports
# none, so callers do not go on to run ssh against an empty host name.
vm_ip() {
  local ip
  ip=$(az vm show -g "$RG" -n "$VM" -d --query publicIps -o tsv)
  if [ -z "$ip" ]; then
    echo "Error: no public IP reported for $VM" >&2
    return 1
  fi
  printf '%s\n' "$ip"
}

case "${1:-help}" in
  create)
    if [ ! -f "$SSH_KEY" ]; then
      echo "Error: SSH public key not found at $SSH_KEY"
      echo "Generate one: ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519"
      echo "Or set SSH_KEY=/path/to/key.pub"
      exit 1
    fi
    if [ ! -f "$CLOUD_INIT" ]; then
      echo "Error: cloud-init file not found at $CLOUD_INIT" >&2
      exit 1
    fi

    # Resolve the caller's address before provisioning anything, so a lookup
    # failure cannot leave a billed VM behind with no SSH rule. HTTPS and -f
    # keep a tampered or error response from ending up in the firewall rule;
    # -4 is needed because the rule below is written as an IPv4 /32.
    MY_IP=$(curl -fsS -4 https://ifconfig.me)
    ipv4_re='^[0-9]{1,3}(\.[0-9]{1,3}){3}$'
    if [[ ! "$MY_IP" =~ $ipv4_re ]]; then
      echo "Error: could not determine public IPv4 address (got: '$MY_IP')" >&2
      exit 1
    fi

    echo "Creating resource group..."
    az group create --name "$RG" --location "$REGION" --only-show-errors --output none

    echo "Creating VM (Trusted Launch, SSH-only, locked-down NSG)..."
    az vm create \
      --resource-group "$RG" \
      --name "$VM" \
      --image "$IMAGE" \
      --size "$SIZE" \
      --os-disk-size-gb 32 \
      --admin-username azureuser \
      --ssh-key-values "$SSH_KEY" \
      --authentication-type ssh \
      --security-type TrustedLaunch \
      --enable-secure-boot true \
      --enable-vtpm true \
      --nsg-rule NONE \
      --custom-data "$CLOUD_INIT" \
      --only-show-errors

    echo "Restricting SSH to $MY_IP..."
    az network nsg rule create \
      --resource-group "$RG" \
      --nsg-name "${VM}NSG" \
      --name AllowSSHFromMyIP \
      --priority 1000 \
      --source-address-prefixes "$MY_IP/32" \
      --destination-port-ranges 22 \
      --access Allow \
      --protocol Tcp \
      --output none
    echo "VM created. Get IP with: $0 ip"
    ;;
  start)
    echo "Starting VM..."
    az vm start --resource-group "$RG" --name "$VM"
    echo "Started. IP: $(az vm show -g "$RG" -n "$VM" -d --query publicIps -o tsv)"
    ;;
  stop)
    echo "Deallocating VM (stops billing)..."
    az vm deallocate --resource-group "$RG" --name "$VM"
    echo "Deallocated."
    ;;
  ip)
    az vm show -g "$RG" -n "$VM" -d --query publicIps -o tsv
    ;;
  ssh)
    IP=$(vm_ip)
    ssh -A azureuser@"$IP" "${@:2}"
    ;;
  bench)
    BRANCH="${2:?Usage: $0 bench <branch>}"
    IP=$(vm_ip)
    # The remote side re-parses this string with a shell, so quote the ref
    # with %q to keep spaces or metacharacters from being split or executed.
    ssh -A azureuser@"$IP" "bash ~/bench/bench_blackbox.sh $(printf '%q' "$BRANCH")"
    ;;
  ab)
    BASE="${2:?Usage: $0 ab <base-branch> <opt-branch>}"
    OPT="${3:?Usage: $0 ab <base-branch> <opt-branch>}"
    IP=$(vm_ip)
    # Same remote-shell quoting as in `bench`.
    ssh -A azureuser@"$IP" \
      "bash ~/bench/bench_ab.sh $(printf '%q' "$BASE") $(printf '%q' "$OPT")"
    ;;
  destroy)
    echo "Destroying resource group (all resources)..."
    az group delete --name "$RG" --yes --no-wait
    echo "Deletion started."
    ;;
  help|*)
    echo "Usage: $0 {create|start|stop|ip|ssh|bench <branch>|ab <base> <opt>|destroy}"
    echo ""
    echo " create - Provision VM with cloud-init"
    echo " start - Start deallocated VM"
    echo " stop - Deallocate VM (stops billing)"
    echo " ip - Show VM public IP"
    echo " ssh - SSH into VM with agent forwarding"
    echo " bench - Run benchmarks on a branch"
    echo " ab - A/B comparison between two branches"
    echo " destroy - Delete resource group and all resources"
    ;;
esac