codeflash-agent/.codeflash/krrt7/codeflash-ai/blackbox/infra/cloud-init.yaml
Kevin Turcios 1ff2a76152
perf(analytics): use rfind and local json.loads (#44)
* perf(analytics): use rfind and local json.loads for hot paths

Replace Path().suffix with string rfind for extension extraction,
use local json.loads binding and bytes split for JSONL parsing.

* fix: use splitlines and preserve extensionless file behavior

split("\n") mishandles \r\n line endings. The early return on
extensionless files changed behavior vs the original Path().suffix
which returned "" and fell through. Use splitlines() and let
extensionless files fall through with lang=None.

* style: use ternary for extensionless file check per SIM108

* Add blackbox benchmark VM infra

D2s_v5 (non-burstable, 2 vCPU, 8 GB) with cloud-init provisioning,
CPU-pinned benchmarks, and A/B comparison scripts.

---------

Co-authored-by: codeflash[bot] <codeflash[bot]@users.noreply.github.com>
2026-04-29 03:22:42 -05:00

256 lines
9.2 KiB
YAML

#cloud-config
#
# Benchmark VM provisioning for blackbox package (codeflash-agent monorepo)
#
# Pure Python package -- no system-level deps beyond build tools.
# Private repo: requires SSH agent forwarding for clone.
#
# Two-phase setup:
# Phase 1 (cloud-init): packages, hyperfine, uv
# Phase 2 (manual): ssh -A, clone, uv sync, baseline benchmarks
#
# Usage:
# az vm create ... --custom-data infra/cloud-init.yaml
# bash infra/vm-manage.sh ssh
# bash ~/setup.sh
#
# VM: Azure Standard_D2s_v5 (2 vCPU, 8 GB RAM, general-purpose)
# Smallest non-burstable option -- blackbox is CPU-bound, not memory-bound.
# Use taskset -c 0 to pin benchmarks to 1 core for consistent results.
# Non-burstable ensures consistent CPU -- no credit depletion or turbo variability.
# Refresh apt metadata before installing packages.
package_update: true
# Minimal toolchain: git for cloning, build-essential for any sdist builds
# pulled in by `uv sync`, curl for the uv installer in runcmd.
packages:
- git
- build-essential
- curl
write_files:
# --- Benchmark: blackbox functions (main vs branch) ---
# Checks out the requested branch, syncs deps, and runs the per-function
# benchmark pinned to core 0 for stable timings (see D2s_v5 notes above).
- path: /home/azureuser/bench/bench_blackbox.sh
  owner: "azureuser:azureuser"
  permissions: "0755"
  defer: true  # write after user creation so ownership applies
  content: |
    #!/usr/bin/env bash
    set -euo pipefail
    # uv is installed to ~/.local/bin by runcmd; put it on PATH so this
    # script also works when invoked non-interactively (no login rc files).
    export PATH="$HOME/.local/bin:$PATH"
    BRANCH="${1:-HEAD}"
    PYTHON=.venv/bin/python
    cd ~/codeflash-agent
    echo "=== Benchmarking blackbox: $BRANCH ==="
    git fetch origin
    git checkout "$BRANCH"
    uv sync
    # Pin to CPU 0 for run-to-run consistency on the 2-vCPU VM.
    taskset -c 0 $PYTHON /home/azureuser/bench/bench_functions.py \
      ~/codeflash-agent
# --- Benchmark: A/B comparison ---
# Runs bench_blackbox.sh twice: once on the baseline branch, once on the
# optimized branch, printing both result sets for side-by-side comparison.
- path: /home/azureuser/bench/bench_ab.sh
  owner: "azureuser:azureuser"
  permissions: "0755"
  defer: true  # write after user creation so ownership applies
  content: |
    #!/usr/bin/env bash
    set -euo pipefail
    # Both branch arguments are mandatory; ${var:?} aborts with usage text.
    BASE="${1:?Usage: bench_ab.sh <base-branch> <opt-branch>}"
    OPT="${2:?Usage: bench_ab.sh <base-branch> <opt-branch>}"
    echo "=== A/B comparison: $BASE vs $OPT ==="
    echo ""
    echo "--- BASELINE: $BASE ---"
    bash ~/bench/bench_blackbox.sh "$BASE"
    echo ""
    echo "--- OPTIMIZED: $OPT ---"
    bash ~/bench/bench_blackbox.sh "$OPT"
# --- Benchmark: Python script for per-function timing ---
- path: /home/azureuser/bench/bench_functions.py
  owner: "azureuser:azureuser"
  permissions: "0644"
  defer: true  # write after user creation so ownership applies
  content: |
    """Benchmark blackbox hot-path functions -- min-of-5 runs per function.

    Usage: bench_functions.py <repo-root>
    The repo root is used to put packages/blackbox/src on sys.path.
    """
    from __future__ import annotations

    import inspect
    import json
    import sys
    import tempfile
    import timeit
    from collections import Counter
    from pathlib import Path

    # Make the blackbox package importable from the checkout given as argv[1].
    sys.path.insert(0, str(Path(sys.argv[1]) / "packages/blackbox/src"))

    from blackbox.models import (
        CODEFLASH_AGENT_PREFIXES,
        CODEFLASH_COMMANDS,
        CODEFLASH_SKILLS,
        LogEntry,
    )


    def _build_transcript(n_lines: int = 500) -> Path:
        """Write a synthetic JSONL transcript and return its path.

        Entries cycle every three lines: plain user message, assistant turn
        with two tool_use blocks plus token usage, then a tool_result
        (every 15th overall line is an error result). Caller must unlink
        the returned temp file.
        """
        entries = []
        for i in range(n_lines):
            ts = f"2025-01-15T10:{i // 60:02d}:{i % 60:02d}Z"
            if i % 3 == 0:
                # Plain user message with session metadata.
                entries.append(json.dumps({
                    "type": "user", "timestamp": ts, "sessionId": "sess-bench",
                    "cwd": "/home/user/project", "gitBranch": "feature-x",
                    "message": {"content": f"User message {i}"},
                }))
            elif i % 3 == 1:
                # Assistant turn: text block + Write and Bash tool calls.
                entries.append(json.dumps({
                    "type": "assistant", "timestamp": ts,
                    "message": {
                        "content": [
                            {"type": "text", "text": f"Step {i}."},
                            {"type": "tool_use", "id": f"tool_{i}", "name": "Write",
                             "input": {"file_path": f"/project/mod_{i % 10}.py",
                                       "content": "def f():\n pass\n"}},
                            {"type": "tool_use", "id": f"tool_{i}b", "name": "Bash",
                             "input": {"command": f"git commit -m 'step {i}'"}},
                        ],
                        "usage": {"input_tokens": 1000, "output_tokens": 200,
                                  "cache_read_input_tokens": 50,
                                  "cache_creation_input_tokens": 25},
                    },
                }))
            else:
                # tool_result for the previous assistant turn.
                entries.append(json.dumps({
                    "type": "user", "timestamp": ts,
                    "message": {"content": [
                        {"type": "tool_result", "tool_use_id": f"tool_{i - 1}",
                         "is_error": i % 15 == 0,
                         "content": "OK" if i % 15 != 0 else "Error: exit code 1"}
                    ]},
                }))
        tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False, mode="w")
        tmp.write("\n".join(entries))
        tmp.close()
        return Path(tmp.name)


    def _build_log_entries(n: int = 200) -> list[LogEntry]:
        """Build n LogEntry objects cycling through all log levels."""
        entries = []
        levels = ["assistant", "tool_call", "tool_result", "status", "error", "info"]
        for i in range(n):
            lvl = levels[i % len(levels)]
            entries.append(LogEntry(
                timestamp=1700000000.0 + i,
                source="claude" if lvl in ("assistant", "tool_call") else "user",
                level=lvl,
                message=f"Sample message {i} with /path/to/some/file.py content",
                data={"tool_name": "Write", "preview": "edit file.py"}
                if lvl == "tool_call" else {},
            ))
        return entries


    def best_of(fn, rounds: int = 5) -> float:
        """Return the minimum over `rounds` calls to fn (noise-resistant)."""
        return min(fn() for _ in range(rounds))


    def main() -> None:
        transcript = _build_transcript(500)
        entries = _build_log_entries(200)
        # Imported here so sys.path is configured before the package loads.
        from blackbox.analytics import extract_meta, track_file_changes
        from blackbox.dashboard.transcript import parse_transcript
        from blackbox.dashboard.rendering import render_log_html

        # track_file_changes may or may not take a `languages` argument
        # depending on the branch under test; detect the signature so the
        # same script benchmarks both variants.
        sig = inspect.signature(track_file_changes)
        has_languages = "languages" in sig.parameters

        tool_inputs = [
            ("Write", {"file_path": "/project/src/main.py", "content": "x = 1\ny = 2\n"}),
            ("Edit", {"file_path": "/project/src/utils.py", "old_string": "a", "new_string": "b"}),
            ("Write", {"file_path": "/project/README.md", "content": "hello"}),
            ("Write", {"file_path": "/project/Makefile", "content": "all:"}),
            ("Write", {"file_path": "/project/src/app.tsx", "content": "export default () => {}"}),
        ]
        test_vals = [
            "codeflash-python", "codeflash", "random-agent", "codeflash-review",
            "/optimize", "/status", "unknown-cmd", "/benchmark",
            "other-prefix", "codeflash-researcher",
        ]
        try:
            # Warmup so first-call overhead (caches, lazy imports) is excluded.
            extract_meta(transcript)
            parse_transcript(transcript)
            for e in entries:
                render_log_html(e)

            t1 = best_of(lambda: timeit.timeit(
                lambda: extract_meta(transcript), number=200) / 200)

            def _track():
                f: set[str] = set()
                langs: Counter[str] = Counter()
                for tn, ti in tool_inputs:
                    if has_languages:
                        track_file_changes(tn, ti, f, langs)
                    else:
                        track_file_changes(tn, ti, f)

            t2 = best_of(lambda: timeit.timeit(_track, number=10000) / 10000)
            t3 = best_of(lambda: timeit.timeit(
                lambda: parse_transcript(transcript), number=200) / 200)

            def _render():
                for e in entries:
                    render_log_html(e)

            t4 = best_of(lambda: timeit.timeit(_render, number=1000) / 1000)

            def _member():
                for v in test_vals:
                    _ = v in CODEFLASH_AGENT_PREFIXES
                    _ = v in CODEFLASH_SKILLS
                    _ = v in CODEFLASH_COMMANDS

            t5 = best_of(lambda: timeit.timeit(_member, number=100000) / 100000)

            print(f"extract_meta {t1*1000:.4f} ms")
            print(f"track_file_changes {t2*1000:.4f} ms")
            print(f"parse_transcript {t3*1000:.4f} ms")
            print(f"render_log_html {t4*1000:.4f} ms")
            print(f"membership {t5*1000:.6f} ms")
            print(f"TOTAL {(t1+t2+t3+t4+t5)*1000:.4f} ms")
        finally:
            # Remove the temp transcript even if a benchmark raises.
            transcript.unlink(missing_ok=True)


    if __name__ == "__main__":
        main()
# --- Post-provision setup (run manually after ssh -A) ---
# Requires SSH agent forwarding (ssh -A) for the private-repo clone.
- path: /home/azureuser/setup.sh
  owner: "azureuser:azureuser"
  permissions: "0755"
  defer: true  # write after user creation so ownership applies
  content: |
    #!/usr/bin/env bash
    set -euo pipefail
    # uv was installed to ~/.local/bin by runcmd; cloud-init cannot alter
    # an interactive login PATH, so export it here.
    export PATH="$HOME/.local/bin:$PATH"
    echo "=== Cloning codeflash-agent ==="
    git clone git@github.com:codeflash-ai/codeflash-agent.git ~/codeflash-agent
    cd ~/codeflash-agent
    echo "=== Installing dependencies ==="
    uv sync
    echo "=== Creating results directory ==="
    mkdir -p ~/results
    echo "=== Verifying installation ==="
    .venv/bin/python -c 'from blackbox.models import LogEntry; print("OK")'
    echo "=== Done ==="
# Phase-1 provisioning commands, run once at first boot (as root).
runcmd:
# Install hyperfine from the upstream .deb (not in Ubuntu's default repos).
- wget -q https://github.com/sharkdp/hyperfine/releases/download/v1.19.0/hyperfine_1.19.0_amd64.deb -O /tmp/hyperfine.deb
- dpkg -i /tmp/hyperfine.deb
# Install uv as azureuser (lands in ~azureuser/.local/bin, not system PATH).
- su - azureuser -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'