mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
Add blackbox benchmark VM infra
D2s_v5 (non-burstable, 2 vCPU, 8 GB) with cloud-init provisioning, CPU-pinned benchmarks, and A/B comparison scripts.
This commit is contained in:
parent
f9f25538d2
commit
b46101612d
2 changed files with 370 additions and 0 deletions
256
.codeflash/krrt7/codeflash-ai/blackbox/infra/cloud-init.yaml
Normal file
256
.codeflash/krrt7/codeflash-ai/blackbox/infra/cloud-init.yaml
Normal file
|
|
@ -0,0 +1,256 @@
|
|||
#cloud-config
|
||||
#
|
||||
# Benchmark VM provisioning for blackbox package (codeflash-agent monorepo)
|
||||
#
|
||||
# Pure Python package -- no system-level deps beyond build tools.
|
||||
# Private repo: requires SSH agent forwarding for clone.
|
||||
#
|
||||
# Two-phase setup:
|
||||
# Phase 1 (cloud-init): packages, hyperfine, uv
|
||||
# Phase 2 (manual): ssh -A, clone, uv sync, baseline benchmarks
|
||||
#
|
||||
# Usage:
|
||||
# az vm create ... --custom-data infra/cloud-init.yaml
|
||||
# bash infra/vm-manage.sh ssh
|
||||
# bash ~/setup.sh
|
||||
#
|
||||
# VM: Azure Standard_D2s_v5 (2 vCPU, 8 GB RAM, general-purpose)
|
||||
# Smallest non-burstable option -- blackbox is CPU-bound, not memory-bound.
|
||||
# Use taskset -c 0 to pin benchmarks to 1 core for consistent results.
|
||||
# Non-burstable ensures consistent CPU -- no credit depletion or turbo variability.
|
||||
|
||||
package_update: true
|
||||
packages:
|
||||
- git
|
||||
- build-essential
|
||||
- curl
|
||||
|
||||
write_files:
|
||||
# --- Benchmark: blackbox functions (main vs branch) ---
|
||||
- path: /home/azureuser/bench/bench_blackbox.sh
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0755"
|
||||
defer: true
|
||||
content: |
|
||||
#!/usr/bin/env bash
# Benchmark the blackbox package at a given git ref (defaults to HEAD).
# Checks out the ref, syncs dependencies, then runs the Python benchmark
# driver pinned to a single CPU core for stable timings.
set -euo pipefail

target="${1:-HEAD}"
py=.venv/bin/python

cd ~/codeflash-agent

echo "=== Benchmarking blackbox: $target ==="
git fetch origin
git checkout "$target"
uv sync

# taskset -c 0: pin to core 0 so results aren't skewed by core migration.
taskset -c 0 "$py" /home/azureuser/bench/bench_functions.py ~/codeflash-agent
|
||||
|
||||
# --- Benchmark: A/B comparison ---
|
||||
- path: /home/azureuser/bench/bench_ab.sh
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0755"
|
||||
defer: true
|
||||
content: |
|
||||
#!/usr/bin/env bash
# A/B benchmark driver: run the blackbox benchmark first on a baseline
# branch, then on an optimized branch, so the two reports can be compared.
set -euo pipefail

base_ref="${1:?Usage: bench_ab.sh <base-branch> <opt-branch>}"
opt_ref="${2:?Usage: bench_ab.sh <base-branch> <opt-branch>}"

echo "=== A/B comparison: $base_ref vs $opt_ref ==="

# Baseline first, then optimized — both on the same pinned core
# (see bench_blackbox.sh).
echo ""
echo "--- BASELINE: $base_ref ---"
bash ~/bench/bench_blackbox.sh "$base_ref"

echo ""
echo "--- OPTIMIZED: $opt_ref ---"
bash ~/bench/bench_blackbox.sh "$opt_ref"
|
||||
|
||||
# --- Benchmark: Python script for per-function timing ---
|
||||
- path: /home/azureuser/bench/bench_functions.py
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0644"
|
||||
defer: true
|
||||
content: |
|
||||
"""Benchmark blackbox hot-path functions -- min-of-5 runs per function."""
|
||||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import sys
|
||||
import tempfile
|
||||
import timeit
|
||||
from collections import Counter
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(sys.argv[1]) / "packages/blackbox/src"))
|
||||
|
||||
from blackbox.models import (
|
||||
CODEFLASH_AGENT_PREFIXES,
|
||||
CODEFLASH_COMMANDS,
|
||||
CODEFLASH_SKILLS,
|
||||
LogEntry,
|
||||
)
|
||||
|
||||
|
||||
def _build_transcript(n_lines: int = 500) -> Path:
|
||||
entries = []
|
||||
for i in range(n_lines):
|
||||
ts = f"2025-01-15T10:{i // 60:02d}:{i % 60:02d}Z"
|
||||
if i % 3 == 0:
|
||||
entries.append(json.dumps({
|
||||
"type": "user", "timestamp": ts, "sessionId": "sess-bench",
|
||||
"cwd": "/home/user/project", "gitBranch": "feature-x",
|
||||
"message": {"content": f"User message {i}"},
|
||||
}))
|
||||
elif i % 3 == 1:
|
||||
entries.append(json.dumps({
|
||||
"type": "assistant", "timestamp": ts,
|
||||
"message": {
|
||||
"content": [
|
||||
{"type": "text", "text": f"Step {i}."},
|
||||
{"type": "tool_use", "id": f"tool_{i}", "name": "Write",
|
||||
"input": {"file_path": f"/project/mod_{i % 10}.py",
|
||||
"content": "def f():\n pass\n"}},
|
||||
{"type": "tool_use", "id": f"tool_{i}b", "name": "Bash",
|
||||
"input": {"command": f"git commit -m 'step {i}'"}},
|
||||
],
|
||||
"usage": {"input_tokens": 1000, "output_tokens": 200,
|
||||
"cache_read_input_tokens": 50,
|
||||
"cache_creation_input_tokens": 25},
|
||||
},
|
||||
}))
|
||||
else:
|
||||
entries.append(json.dumps({
|
||||
"type": "user", "timestamp": ts,
|
||||
"message": {"content": [
|
||||
{"type": "tool_result", "tool_use_id": f"tool_{i - 1}",
|
||||
"is_error": i % 15 == 0,
|
||||
"content": "OK" if i % 15 != 0 else "Error: exit code 1"}
|
||||
]},
|
||||
}))
|
||||
tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False, mode="w")
|
||||
tmp.write("\n".join(entries))
|
||||
tmp.close()
|
||||
return Path(tmp.name)
|
||||
|
||||
|
||||
def _build_log_entries(n: int = 200) -> list[LogEntry]:
    """Build *n* synthetic LogEntry objects cycling through all log levels.

    Only ``tool_call`` entries carry a non-empty ``data`` payload, matching
    what the renderer under benchmark expects to see.
    """
    level_cycle = ("assistant", "tool_call", "tool_result", "status", "error", "info")
    built: list[LogEntry] = []
    for idx in range(n):
        level = level_cycle[idx % len(level_cycle)]
        # Claude-originated levels get a "claude" source; everything else "user".
        origin = "claude" if level in ("assistant", "tool_call") else "user"
        payload = (
            {"tool_name": "Write", "preview": "edit file.py"}
            if level == "tool_call"
            else {}
        )
        built.append(LogEntry(
            timestamp=1700000000.0 + idx,
            source=origin,
            level=level,
            message=f"Sample message {idx} with /path/to/some/file.py content",
            data=payload,
        ))
    return built
|
||||
|
||||
|
||||
def best_of(fn, rounds: int = 5) -> float:
    """Call *fn* *rounds* times and return the smallest result (best-of-N)."""
    samples = [fn() for _ in range(rounds)]
    return min(samples)
|
||||
|
||||
|
||||
def main() -> None:
    """Benchmark blackbox hot-path functions and print per-function timings.

    Builds synthetic fixtures, warms the code paths once, then times each
    target with best-of-5 ``timeit`` runs and prints per-call milliseconds.
    """
    transcript = _build_transcript(500)
    entries = _build_log_entries(200)

    # Imported here (after sys.path setup at module level) so the checked-out
    # branch's code is what gets measured.
    from blackbox.analytics import extract_meta, track_file_changes
    from blackbox.dashboard.transcript import parse_transcript
    from blackbox.dashboard.rendering import render_log_html

    # Some revisions of track_file_changes lack the `languages` parameter;
    # probe the signature so both sides of an A/B run can be benchmarked.
    has_languages = "languages" in inspect.signature(track_file_changes).parameters

    tool_inputs = [
        ("Write", {"file_path": "/project/src/main.py", "content": "x = 1\ny = 2\n"}),
        ("Edit", {"file_path": "/project/src/utils.py", "old_string": "a", "new_string": "b"}),
        ("Write", {"file_path": "/project/README.md", "content": "hello"}),
        ("Write", {"file_path": "/project/Makefile", "content": "all:"}),
        ("Write", {"file_path": "/project/src/app.tsx", "content": "export default () => {}"}),
    ]
    test_vals = [
        "codeflash-python", "codeflash", "random-agent", "codeflash-review",
        "/optimize", "/status", "unknown-cmd", "/benchmark",
        "other-prefix", "codeflash-researcher",
    ]

    # Warmup: populate any caches so the timed runs measure steady state.
    extract_meta(transcript)
    parse_transcript(transcript)
    for entry in entries:
        render_log_html(entry)

    t1 = best_of(
        lambda: timeit.timeit(lambda: extract_meta(transcript), number=200) / 200
    )

    def run_track():
        # Fresh accumulators each run so per-call work stays constant.
        changed: set[str] = set()
        lang_counts: Counter[str] = Counter()
        for tool_name, tool_input in tool_inputs:
            if has_languages:
                track_file_changes(tool_name, tool_input, changed, lang_counts)
            else:
                track_file_changes(tool_name, tool_input, changed)

    t2 = best_of(lambda: timeit.timeit(run_track, number=10000) / 10000)

    t3 = best_of(
        lambda: timeit.timeit(lambda: parse_transcript(transcript), number=200) / 200
    )

    def run_render():
        for entry in entries:
            render_log_html(entry)

    t4 = best_of(lambda: timeit.timeit(run_render, number=1000) / 1000)

    def run_membership():
        for candidate in test_vals:
            _ = candidate in CODEFLASH_AGENT_PREFIXES
            _ = candidate in CODEFLASH_SKILLS
            _ = candidate in CODEFLASH_COMMANDS

    t5 = best_of(lambda: timeit.timeit(run_membership, number=100000) / 100000)

    print(f"extract_meta {t1*1000:.4f} ms")
    print(f"track_file_changes {t2*1000:.4f} ms")
    print(f"parse_transcript {t3*1000:.4f} ms")
    print(f"render_log_html {t4*1000:.4f} ms")
    print(f"membership {t5*1000:.6f} ms")
    print(f"TOTAL {(t1+t2+t3+t4+t5)*1000:.4f} ms")

    # Clean up the temp fixture created by _build_transcript.
    transcript.unlink(missing_ok=True)
|
||||
|
||||
|
||||
# Script entry point: run the full benchmark suite once and exit.
if __name__ == "__main__":
    main()
|
||||
|
||||
# --- Post-provision setup (run manually after ssh -A) ---
|
||||
- path: /home/azureuser/setup.sh
|
||||
owner: azureuser:azureuser
|
||||
permissions: "0755"
|
||||
defer: true
|
||||
content: |
|
||||
#!/usr/bin/env bash
# Post-provision setup: clone the private repo, install dependencies, and
# verify the install. Run manually over `ssh -A` so the forwarded SSH agent
# can authenticate the git clone.
set -euo pipefail

# uv's installer (run by cloud-init) drops the binary into ~/.local/bin.
export PATH="$HOME/.local/bin:$PATH"

# Small helper so every phase is announced the same way.
step() { echo "=== $1 ==="; }

step "Cloning codeflash-agent"
git clone git@github.com:codeflash-ai/codeflash-agent.git ~/codeflash-agent
cd ~/codeflash-agent

step "Installing dependencies"
uv sync

step "Creating results directory"
mkdir -p ~/results

step "Verifying installation"
.venv/bin/python -c 'from blackbox.models import LogEntry; print("OK")'

step "Done"
||||
|
||||
runcmd:
|
||||
- wget -q https://github.com/sharkdp/hyperfine/releases/download/v1.19.0/hyperfine_1.19.0_amd64.deb -O /tmp/hyperfine.deb
|
||||
- dpkg -i /tmp/hyperfine.deb
|
||||
- su - azureuser -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'
|
||||
114
.codeflash/krrt7/codeflash-ai/blackbox/infra/vm-manage.sh
Normal file
114
.codeflash/krrt7/codeflash-ai/blackbox/infra/vm-manage.sh
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
#!/usr/bin/env bash
#
# Azure benchmark VM lifecycle management for blackbox package
#
# Usage:
#   bash infra/vm-manage.sh {create|start|stop|ip|ssh|bench <branch>|ab <base> <opt>|destroy}

set -euo pipefail

RG="blackbox-BENCH-RG"
VM="blackbox-bench"
REGION="westus2"
SIZE="Standard_D2s_v5"
IMAGE="Canonical:ubuntu-24_04-lts:server:latest"
SSH_KEY="${SSH_KEY:-$HOME/.ssh/id_ed25519.pub}"

# Resolve the VM's current public IP (shared by start/ip/ssh/bench/ab).
vm_ip() {
    az vm show -g "$RG" -n "$VM" -d --query publicIps -o tsv
}

case "${1:-help}" in
    create)
        # Refuse to create a VM without a public key to install.
        if [ ! -f "$SSH_KEY" ]; then
            echo "Error: SSH public key not found at $SSH_KEY"
            echo "Generate one: ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519"
            echo "Or set SSH_KEY=/path/to/key.pub"
            exit 1
        fi

        echo "Creating resource group..."
        az group create --name "$RG" --location "$REGION" --only-show-errors --output none

        # --nsg-rule NONE: start fully closed; SSH is opened below for the
        # caller's IP only.
        echo "Creating VM (Trusted Launch, SSH-only, locked-down NSG)..."
        az vm create \
            --resource-group "$RG" \
            --name "$VM" \
            --image "$IMAGE" \
            --size "$SIZE" \
            --os-disk-size-gb 32 \
            --admin-username azureuser \
            --ssh-key-values "$SSH_KEY" \
            --authentication-type ssh \
            --security-type TrustedLaunch \
            --enable-secure-boot true \
            --enable-vtpm true \
            --nsg-rule NONE \
            --custom-data infra/cloud-init.yaml \
            --only-show-errors

        # Allow SSH from the machine running this script, and nothing else.
        caller_ip=$(curl -s ifconfig.me)
        echo "Restricting SSH to $caller_ip..."
        az network nsg rule create \
            --resource-group "$RG" \
            --nsg-name "${VM}NSG" \
            --name AllowSSHFromMyIP \
            --priority 1000 \
            --source-address-prefixes "$caller_ip/32" \
            --destination-port-ranges 22 \
            --access Allow \
            --protocol Tcp \
            --output none

        echo "VM created. Get IP with: $0 ip"
        ;;

    start)
        echo "Starting VM..."
        az vm start --resource-group "$RG" --name "$VM"
        echo "Started. IP: $(vm_ip)"
        ;;

    stop)
        # Deallocate (not just power off) so compute billing stops.
        echo "Deallocating VM (stops billing)..."
        az vm deallocate --resource-group "$RG" --name "$VM"
        echo "Deallocated."
        ;;

    ip)
        vm_ip
        ;;

    ssh)
        # -A forwards the SSH agent so the private repo can be cloned on-VM.
        ssh -A azureuser@"$(vm_ip)" "${@:2}"
        ;;

    bench)
        ref="${2:?Usage: $0 bench <branch>}"
        ssh -A azureuser@"$(vm_ip)" "bash ~/bench/bench_blackbox.sh $ref"
        ;;

    ab)
        base_ref="${2:?Usage: $0 ab <base-branch> <opt-branch>}"
        opt_ref="${3:?Usage: $0 ab <base-branch> <opt-branch>}"
        ssh -A azureuser@"$(vm_ip)" "bash ~/bench/bench_ab.sh $base_ref $opt_ref"
        ;;

    destroy)
        # --no-wait: deletion proceeds in the background.
        echo "Destroying resource group (all resources)..."
        az group delete --name "$RG" --yes --no-wait
        echo "Deletion started."
        ;;

    help|*)
        echo "Usage: $0 {create|start|stop|ip|ssh|bench <branch>|ab <base> <opt>|destroy}"
        echo ""
        echo "  create  - Provision VM with cloud-init"
        echo "  start   - Start deallocated VM"
        echo "  stop    - Deallocate VM (stops billing)"
        echo "  ip      - Show VM public IP"
        echo "  ssh     - SSH into VM with agent forwarding"
        echo "  bench   - Run benchmarks on a branch"
        echo "  ab      - A/B comparison between two branches"
        echo "  destroy - Delete resource group and all resources"
        ;;
esac
|
||||
Loading…
Reference in a new issue