mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
* perf(analytics): use rfind and local json.loads for hot paths
Replace Path().suffix with string rfind for extension extraction,
use local json.loads binding and bytes split for JSONL parsing.
* fix: use splitlines and preserve extensionless file behavior
split("\n") mishandles \r\n line endings. The early return on
extensionless files changed behavior vs the original Path().suffix
which returned "" and fell through. Use splitlines() and let
extensionless files fall through with lang=None.
* style: use ternary for extensionless file check per SIM108
* Add blackbox benchmark VM infra
D2s_v5 (non-burstable, 2 vCPU, 8 GB) with cloud-init provisioning,
CPU-pinned benchmarks, and A/B comparison scripts.
---------
Co-authored-by: codeflash[bot] <codeflash[bot]@users.noreply.github.com>
256 lines
9.2 KiB
YAML
#cloud-config
#
# Benchmark VM provisioning for blackbox package (codeflash-agent monorepo)
#
# Pure Python package -- no system-level deps beyond build tools.
# Private repo: requires SSH agent forwarding for clone.
#
# Two-phase setup:
#   Phase 1 (cloud-init): packages, hyperfine, uv
#   Phase 2 (manual): ssh -A, clone, uv sync, baseline benchmarks
#
# Usage:
#   az vm create ... --custom-data infra/cloud-init.yaml
#   bash infra/vm-manage.sh ssh
#   bash ~/setup.sh
#
# VM: Azure Standard_D2s_v5 (2 vCPU, 8 GB RAM, general-purpose)
# Smallest non-burstable option -- blackbox is CPU-bound, not memory-bound.
# Use taskset -c 0 to pin benchmarks to 1 core for consistent results.
# Non-burstable ensures consistent CPU -- no credit depletion or turbo variability.
|
package_update: true
packages:
  - git
  # Compilers/headers in case uv sync needs to build any native wheel.
  - build-essential
  # Used by the uv installer in runcmd.
  - curl
|
write_files:
  # --- Benchmark: blackbox functions (main vs branch) ---
  - path: /home/azureuser/bench/bench_blackbox.sh
    owner: azureuser:azureuser
    permissions: "0755"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
#!/usr/bin/env bash
# Benchmark the blackbox package at a given git ref.
# Usage: bench_blackbox.sh [branch-or-ref]   (defaults to HEAD)
set -euo pipefail

# uv is installed into ~/.local/bin by its installer (see runcmd); export
# it here so the script also works from non-login shells. setup.sh does
# the same -- this script previously relied on the caller's PATH.
export PATH="$HOME/.local/bin:$PATH"

BRANCH="${1:-HEAD}"
# Relative to the repo checkout entered below.
PYTHON=.venv/bin/python

cd ~/codeflash-agent

echo "=== Benchmarking blackbox: $BRANCH ==="
git fetch origin
git checkout "$BRANCH"
uv sync

# Pin to core 0 so timings are stable on the 2-vCPU VM.
taskset -c 0 "$PYTHON" /home/azureuser/bench/bench_functions.py \
    ~/codeflash-agent
|
|
|
|
  # --- Benchmark: A/B comparison ---
  - path: /home/azureuser/bench/bench_ab.sh
    owner: azureuser:azureuser
    permissions: "0755"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
#!/usr/bin/env bash
# Run the blackbox benchmark for two branches back to back so their
# per-function timings can be compared side by side.
set -euo pipefail

BASE="${1:?Usage: bench_ab.sh <base-branch> <opt-branch>}"
OPT="${2:?Usage: bench_ab.sh <base-branch> <opt-branch>}"

# Print a labelled section header, then benchmark the given ref.
run_side() {
    echo ""
    echo "--- $1: $2 ---"
    bash ~/bench/bench_blackbox.sh "$2"
}

echo "=== A/B comparison: $BASE vs $OPT ==="
run_side BASELINE "$BASE"
run_side OPTIMIZED "$OPT"
|
|
|
|
  # --- Benchmark: Python script for per-function timing ---
  - path: /home/azureuser/bench/bench_functions.py
    owner: azureuser:azureuser
    # Not executable -- invoked via the venv interpreter by bench_blackbox.sh.
    permissions: "0644"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
"""Benchmark blackbox hot-path functions -- min-of-5 runs per function."""

from __future__ import annotations

import inspect
import json
import sys
import tempfile
import timeit
from collections import Counter
from pathlib import Path

# argv[1] is the repo root; put the blackbox package sources on sys.path
# so this script benchmarks the checkout directly without installing it.
sys.path.insert(0, str(Path(sys.argv[1]) / "packages/blackbox/src"))

from blackbox.models import (
    CODEFLASH_AGENT_PREFIXES,
    CODEFLASH_COMMANDS,
    CODEFLASH_SKILLS,
    LogEntry,
)
|
|
|
|
|
|
def _build_transcript(n_lines: int = 500) -> Path:
|
|
entries = []
|
|
for i in range(n_lines):
|
|
ts = f"2025-01-15T10:{i // 60:02d}:{i % 60:02d}Z"
|
|
if i % 3 == 0:
|
|
entries.append(json.dumps({
|
|
"type": "user", "timestamp": ts, "sessionId": "sess-bench",
|
|
"cwd": "/home/user/project", "gitBranch": "feature-x",
|
|
"message": {"content": f"User message {i}"},
|
|
}))
|
|
elif i % 3 == 1:
|
|
entries.append(json.dumps({
|
|
"type": "assistant", "timestamp": ts,
|
|
"message": {
|
|
"content": [
|
|
{"type": "text", "text": f"Step {i}."},
|
|
{"type": "tool_use", "id": f"tool_{i}", "name": "Write",
|
|
"input": {"file_path": f"/project/mod_{i % 10}.py",
|
|
"content": "def f():\n pass\n"}},
|
|
{"type": "tool_use", "id": f"tool_{i}b", "name": "Bash",
|
|
"input": {"command": f"git commit -m 'step {i}'"}},
|
|
],
|
|
"usage": {"input_tokens": 1000, "output_tokens": 200,
|
|
"cache_read_input_tokens": 50,
|
|
"cache_creation_input_tokens": 25},
|
|
},
|
|
}))
|
|
else:
|
|
entries.append(json.dumps({
|
|
"type": "user", "timestamp": ts,
|
|
"message": {"content": [
|
|
{"type": "tool_result", "tool_use_id": f"tool_{i - 1}",
|
|
"is_error": i % 15 == 0,
|
|
"content": "OK" if i % 15 != 0 else "Error: exit code 1"}
|
|
]},
|
|
}))
|
|
tmp = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False, mode="w")
|
|
tmp.write("\n".join(entries))
|
|
tmp.close()
|
|
return Path(tmp.name)
|
|
|
|
|
|
def _build_log_entries(n: int = 200) -> list[LogEntry]:
    """Create *n* synthetic LogEntry rows, cycling through all log levels.

    tool_call entries get a tool payload in ``data``; assistant/tool_call
    rows come from source "claude", everything else from "user".
    """
    level_cycle = ["assistant", "tool_call", "tool_result", "status", "error", "info"]
    claude_levels = ("assistant", "tool_call")
    out: list[LogEntry] = []
    for i in range(n):
        level = level_cycle[i % len(level_cycle)]
        payload = (
            {"tool_name": "Write", "preview": "edit file.py"}
            if level == "tool_call"
            else {}
        )
        out.append(
            LogEntry(
                timestamp=1700000000.0 + i,
                source="claude" if level in claude_levels else "user",
                level=level,
                message=f"Sample message {i} with /path/to/some/file.py content",
                data=payload,
            )
        )
    return out
|
|
|
|
|
|
def best_of(fn, rounds: int = 5) -> float:
    """Call *fn* *rounds* times and return the smallest result.

    Minimum-of-N is the standard noise-robust aggregate for timing runs.
    """
    results = [fn() for _ in range(rounds)]
    return min(results)
|
|
|
|
|
|
def main() -> None:
    """Benchmark blackbox hot paths and print per-function best-of-5 timings.

    Expects sys.argv[1] to be the repo root (consumed at module import to
    extend sys.path). Prints one "<name> <ms>" line per benchmarked
    function plus a TOTAL line.
    """
    transcript = _build_transcript(500)
    # try/finally so the temp transcript is removed even when a benchmark
    # raises (previously it leaked on any mid-run failure).
    try:
        entries = _build_log_entries(200)

        from blackbox.analytics import extract_meta, track_file_changes
        from blackbox.dashboard.transcript import parse_transcript
        from blackbox.dashboard.rendering import render_log_html

        # track_file_changes grew a `languages` parameter on newer branches;
        # detect it so one script benchmarks both old and new signatures.
        sig = inspect.signature(track_file_changes)
        has_languages = "languages" in sig.parameters

        tool_inputs = [
            ("Write", {"file_path": "/project/src/main.py", "content": "x = 1\ny = 2\n"}),
            ("Edit", {"file_path": "/project/src/utils.py", "old_string": "a", "new_string": "b"}),
            ("Write", {"file_path": "/project/README.md", "content": "hello"}),
            ("Write", {"file_path": "/project/Makefile", "content": "all:"}),
            ("Write", {"file_path": "/project/src/app.tsx", "content": "export default () => {}"}),
        ]
        test_vals = [
            "codeflash-python", "codeflash", "random-agent", "codeflash-review",
            "/optimize", "/status", "unknown-cmd", "/benchmark",
            "other-prefix", "codeflash-researcher",
        ]

        # Warmup: absorb first-call overhead (imports, caches) before timing.
        extract_meta(transcript)
        parse_transcript(transcript)
        for e in entries:
            render_log_html(e)

        t1 = best_of(lambda: timeit.timeit(
            lambda: extract_meta(transcript), number=200) / 200)

        def _track():
            # Fresh accumulators per run so state growth doesn't skew timing.
            f: set[str] = set()
            langs: Counter[str] = Counter()
            for tn, ti in tool_inputs:
                if has_languages:
                    track_file_changes(tn, ti, f, langs)
                else:
                    track_file_changes(tn, ti, f)
        t2 = best_of(lambda: timeit.timeit(_track, number=10000) / 10000)

        t3 = best_of(lambda: timeit.timeit(
            lambda: parse_transcript(transcript), number=200) / 200)

        def _render():
            for e in entries:
                render_log_html(e)
        t4 = best_of(lambda: timeit.timeit(_render, number=1000) / 1000)

        def _member():
            # Membership tests against the module-level constant collections.
            for v in test_vals:
                _ = v in CODEFLASH_AGENT_PREFIXES
                _ = v in CODEFLASH_SKILLS
                _ = v in CODEFLASH_COMMANDS
        t5 = best_of(lambda: timeit.timeit(_member, number=100000) / 100000)

        print(f"extract_meta {t1*1000:.4f} ms")
        print(f"track_file_changes {t2*1000:.4f} ms")
        print(f"parse_transcript {t3*1000:.4f} ms")
        print(f"render_log_html {t4*1000:.4f} ms")
        print(f"membership {t5*1000:.6f} ms")
        print(f"TOTAL {(t1+t2+t3+t4+t5)*1000:.4f} ms")
    finally:
        transcript.unlink(missing_ok=True)


if __name__ == "__main__":
    main()
|
|
|
|
  # --- Post-provision setup (run manually after ssh -A) ---
  - path: /home/azureuser/setup.sh
    owner: azureuser:azureuser
    permissions: "0755"
    # defer: write after user/home creation so the ownership above applies.
    defer: true
    content: |
|
|
#!/usr/bin/env bash
# Phase-2 setup: clone the private repo (requires ssh -A for agent
# forwarding), install deps with uv, and sanity-check the install.
set -euo pipefail
export PATH="$HOME/.local/bin:$PATH"

# Print a section banner.
step() { echo "=== $1 ==="; }

step "Cloning codeflash-agent"
git clone git@github.com:codeflash-ai/codeflash-agent.git ~/codeflash-agent
cd ~/codeflash-agent

step "Installing dependencies"
uv sync

step "Creating results directory"
mkdir -p ~/results

step "Verifying installation"
.venv/bin/python -c 'from blackbox.models import LogEntry; print("OK")'

step "Done"
|
|
|
|
runcmd:
  # hyperfine: CLI benchmarking tool, installed from a pinned release .deb.
  - wget -q https://github.com/sharkdp/hyperfine/releases/download/v1.19.0/hyperfine_1.19.0_amd64.deb -O /tmp/hyperfine.deb
  - dpkg -i /tmp/hyperfine.deb
  # runcmd runs as root; install uv per-user for azureuser via su.
  - su - azureuser -c 'curl -LsSf https://astral.sh/uv/install.sh | sh'
|