#!/bin/bash
set -euo pipefail
# Eval runner for codeflash-agent plugin
# Usage:
# ./run-eval.sh <eval-name>                  # run both skill + baseline
# ./run-eval.sh <eval-name> --skill-only     # run with-skill only
# ./run-eval.sh <eval-name> --baseline-only  # run baseline only
# ./run-eval.sh <eval-name> --runs 3         # run 3 times, aggregate
# ./run-eval.sh --list                       # list available evals
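#
# Prerequisites (inferred from the commands this script invokes): jq, git, gh,
# uv, python3, and the claude CLI all need to be on PATH; score-eval.sh and
# score.py are expected to live alongside this script.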
EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
TEMPLATES_DIR="$EVAL_DIR/templates"
REPOS_DIR="$EVAL_DIR/repos"
RESULTS_DIR="$EVAL_DIR/results"
REPO_ROOT="$(cd "$EVAL_DIR/.." && pwd)"
PLUGIN_DIR="$REPO_ROOT"
# --- Helpers ---
die() { echo "ERROR: $*" >&2; exit 1; }
# Find eval directory (templates/ or repos/)
find_eval_dir() {
local name=$1
if [ -d "$TEMPLATES_DIR/$name" ]; then
echo "$TEMPLATES_DIR/$name"
elif [ -d "$REPOS_DIR/$name" ]; then
echo "$REPOS_DIR/$name"
else
echo ""
fi
}
list_templates() {
echo "Available evals:"
echo ""
echo " Templates (v1 — bundled source):"
for d in "$TEMPLATES_DIR"/*/; do
[ -d "$d" ] || continue
name=$(basename "$d")
desc=$(jq -r '.description // "no description"' "$d/manifest.json" 2>/dev/null || echo "no manifest")
printf " %-30s %s\n" "$name" "$desc"
done
echo ""
echo " Repos (v2 — cloned from git):"
if [ -d "$REPOS_DIR" ]; then
for d in "$REPOS_DIR"/*/; do
[ -d "$d" ] || continue
name=$(basename "$d")
repo=$(jq -r '.repo // "?"' "$d/manifest.json" 2>/dev/null || echo "?")
desc=$(jq -r '.description // "no description"' "$d/manifest.json" 2>/dev/null || echo "no manifest")
printf " %-30s %s (%s)\n" "$name" "$desc" "$repo"
done
else
echo " (none)"
fi
}
setup_workspace_v1() {
local template=$1 label=$2
local workdir
workdir=$(mktemp -d)/eval-${label}-${template}
cp -r "$TEMPLATES_DIR/$template" "$workdir"
# Remove any leftover .venv / cache
rm -rf "$workdir/.venv" "$workdir/.pytest_cache" "$workdir/__pycache__"
# Install deps
(cd "$workdir" && uv sync --quiet 2>/dev/null)
echo "$workdir"
}
setup_workspace_v2() {
local eval_name=$1 label=$2 manifest=$3
local repo commit cache_dir workdir
repo=$(jq -r '.repo' "$manifest")
commit=$(jq -r '.commit' "$manifest")
cache_dir="$(dirname "$manifest")/workspace"
workdir=$(mktemp -d)/eval-${label}-${eval_name}
# Clone to cache dir if not already present
if [ ! -d "$cache_dir/.git" ]; then
echo "Cloning $repo at $commit (first run — caching to $cache_dir)..." >&2
gh repo clone "$repo" "$cache_dir" -- --quiet --no-checkout --depth 1
(cd "$cache_dir" \
&& git fetch --quiet --depth 1 origin "$commit" \
&& git checkout --quiet FETCH_HEAD)
else
echo "Using cached clone from $cache_dir" >&2
fi
# Copy cached workspace to isolated temp dir for this run
cp -r "$cache_dir" "$workdir"
echo "$workdir"
}
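# For reference, a v2 manifest is expected to look roughly like this. The shape
# is a sketch inferred from the fields this script reads (.version, .repo,
# .commit, .prompt, .description, .test_command); all values are placeholders,
# not a real eval.
#
#   {
#     "version": 2,
#     "description": "short description of the eval",
#     "repo": "owner/example-repo",
#     "commit": "<full commit sha>",
#     "prompt": "Optimize the slow code path and verify with the tests.",
#     "test_command": "uv run pytest -q"
#   }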
build_prompt() {
local manifest=$1
local version
version=$(jq -r '.version // 1' "$manifest")
# v2: prompt is in the manifest
if [ "$version" -ge 2 ]; then
jq -r '.prompt' "$manifest"
return
fi
# v1: derive prompt from eval_type
local eval_type
eval_type=$(jq -r '.eval_type' "$manifest")
case "$eval_type" in
crossdomain)
echo "The tests in this project are too slow. Profile the code, identify the performance bottlenecks in each module, and optimize them. Run the tests before and after to verify correctness and measure speedup."
;;
layered)
echo "test_process_large_batch is too slow and uses too much memory. Profile and optimize process_records. Run the test before and after to verify correctness and measure improvement."
;;
ranking|ranking-hard)
echo "test_large_batch in test_pipeline.py is too slow. Profile run_pipeline, find what's slow, and optimize. Run the test before and after to verify correctness and measure speedup."
;;
memory)
echo "process_dataset in test_large_dataset uses too much memory — peak RSS is 3x higher than the input data size. Profile memory usage, find what's allocating the most, and reduce peak memory. Run the test before and after to verify correctness."
;;
memory-hard)
echo "process_readings in test_large_dataset uses too much memory — peak is 3x higher than the input data. Profile memory usage step by step, find the dominant allocator, and reduce peak memory. Run the test before and after to verify correctness."
;;
memory-balanced)
echo "process_orders in test_large_batch uses too much memory — peak is over 2x the input data. Profile memory at each pipeline stage, find what's adding overhead, and reduce it. Run the test before and after to verify correctness."
;;
memory-misdirection)
echo "process_transactions in test_large_batch uses too much memory — peak is nearly 3x the input data. Profile memory usage, identify the biggest allocators in order of impact, and fix them. Run the test before and after to verify correctness."
;;
*)
echo "The tests in this project are too slow. Profile the code, find the performance bottlenecks, and optimize them. Run the tests before and after to verify correctness and measure improvement."
;;
esac
}
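# A v1 manifest, by contrast, lives alongside bundled source in templates/<name>/
# and only needs the fields read above plus test_command (used in run_claude).
# Illustrative sketch with placeholder values:
#
#   {
#     "version": 1,
#     "description": "short description of the eval",
#     "eval_type": "ranking",
#     "test_command": "uv run pytest -q"
#   }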
run_claude() {
local workdir=$1 label=$2 result_prefix=$3 use_plugin=$4
local start_time end_time duration
echo "=== Running $label in $workdir ==="
start_time=$(date +%s)
local prompt
prompt=$(cat "$workdir/.eval-prompt")
# When running with plugin, instruct Claude to invoke the skill
if [ "$use_plugin" = "true" ]; then
prompt="IMPORTANT: You MUST use the Skill tool to invoke '/codeflash-optimize' for this task. Do NOT solve the problem directly — delegate to the skill first. This is a non-interactive eval — there is no user to respond. When the skill tells you to ask the user a question, DO NOT ask and DO NOT stop. Instead, immediately launch the codeflash agent with the task below. No constraints apply. Task: $prompt"
fi
local claude_args=(-p "$prompt" --output-format json --verbose)
if [ "$use_plugin" = "true" ]; then
claude_args+=(--plugin-dir "$PLUGIN_DIR")
fi
# Run claude in the workspace dir
# --dangerously-skip-permissions: evals run in temp dirs, safe to allow all tools
local exit_code=0
(cd "$workdir" && claude "${claude_args[@]}" --dangerously-skip-permissions) \
> "${result_prefix}.json" 2> "${result_prefix}.stderr" || exit_code=$?
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "$exit_code" > "${result_prefix}.exitcode"
echo "$label completed in ${duration}s (exit code: $exit_code)"
echo "$duration" > "${result_prefix}.duration"
# Detect empty/missing output — likely a crash or timeout
if [ ! -s "${result_prefix}.json" ]; then
echo "WARNING: $label produced no output (exit code: $exit_code)" >&2
echo "CRASH DIAGNOSTIC: claude exited with code $exit_code after ${duration}s. No JSON output produced." \
>> "${result_prefix}.stderr"
if [ "$exit_code" -eq 124 ] || [ "$duration" -gt 1800 ]; then
echo "LIKELY CAUSE: timeout (duration=${duration}s)" >> "${result_prefix}.stderr"
elif [ "$exit_code" -eq 137 ] || [ "$exit_code" -eq 139 ]; then
echo "LIKELY CAUSE: OOM or signal kill (exit code=$exit_code)" >> "${result_prefix}.stderr"
fi
fi
# Run tests post-optimization to check correctness + timing
local test_cmd
test_cmd=$(jq -r '.test_command // empty' "$RUN_DIR/manifest.json")
if [ -n "$test_cmd" ]; then
echo "Running post-optimization tests..."
(cd "$workdir" && eval "$test_cmd") > "${result_prefix}.tests" 2>&1 || true
else
echo "No test_command in manifest — skipping post-optimization tests"
touch "${result_prefix}.tests"
fi
}
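# Each run_claude call leaves a set of artifacts next to its result prefix:
#   <prefix>.json      full claude output (--output-format json)
#   <prefix>.stderr    stderr, plus any crash diagnostics appended above
#   <prefix>.exitcode  claude's exit code
#   <prefix>.duration  wall-clock seconds for the claude run
#   <prefix>.tests     post-optimization test output (empty if no test_command)
# These are what score-eval.sh is expected to consume when scoring a run.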
run_single() {
# Run a single eval iteration into $RUN_DIR
local eval_name=$1 mode=$2 manifest=$3 prompt=$4 version=$5
# --- With-skill run ---
if [ "$mode" != "--baseline-only" ]; then
local skill_workdir
if [ "$version" -ge 2 ]; then
skill_workdir=$(setup_workspace_v2 "$eval_name" "skill" "$manifest")
else
skill_workdir=$(setup_workspace_v1 "$eval_name" "skill")
fi
echo "$prompt" > "$skill_workdir/.eval-prompt"
run_claude "$skill_workdir" "with-skill" "$RUN_DIR/skill" "true"
echo ""
fi
# --- Baseline run ---
if [ "$mode" != "--skill-only" ]; then
local baseline_workdir
if [ "$version" -ge 2 ]; then
baseline_workdir=$(setup_workspace_v2 "$eval_name" "baseline" "$manifest")
else
baseline_workdir=$(setup_workspace_v1 "$eval_name" "baseline")
fi
echo "$prompt" > "$baseline_workdir/.eval-prompt"
run_claude "$baseline_workdir" "baseline" "$RUN_DIR/baseline" "false"
echo ""
fi
}
# --- Main ---
if [ "${1:-}" = "--list" ]; then
list_templates
exit 0
fi
# --- Parse args ---
eval_name=""
mode="--both"
num_runs=1
while [[ $# -gt 0 ]]; do
case "$1" in
--skill-only|--baseline-only)
mode="$1"
shift
;;
--runs)
num_runs="${2:?--runs requires a number}"
shift 2
;;
-*)
die "Unknown flag: $1"
;;
*)
eval_name="$1"
shift
;;
esac
done
[[ -n "$eval_name" ]] || die "Usage: $0 <eval-name> [--skill-only|--baseline-only] [--runs N]"
EVAL_SOURCE_DIR=$(find_eval_dir "$eval_name")
[ -n "$EVAL_SOURCE_DIR" ] || die "Eval not found: $eval_name. Use --list to see available evals."
MANIFEST="$EVAL_SOURCE_DIR/manifest.json"
[ -f "$MANIFEST" ] || die "No manifest.json in: $EVAL_SOURCE_DIR"
VERSION=$(jq -r '.version // 1' "$MANIFEST")
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
# Build prompt
PROMPT=$(build_prompt "$MANIFEST")
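# Results layout (sketch; the files written by score-eval.sh and score.py are
# not shown here):
#   single run:  results/<eval>-<timestamp>/           manifest.json, skill.*, baseline.*
#   multi-run:   results/<eval>-<timestamp>-<N>runs/   manifest.json, run-1/, run-2/, ...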
if [ "$num_runs" -gt 1 ]; then
# Multi-run mode: create parent dir with run-1/, run-2/, etc.
PARENT_DIR="$RESULTS_DIR/${eval_name}-${TIMESTAMP}-${num_runs}runs"
mkdir -p "$PARENT_DIR"
cp "$MANIFEST" "$PARENT_DIR/manifest.json"
echo "Eval: $eval_name (v$VERSION) — $num_runs runs"
echo "Results: $PARENT_DIR"
echo "Prompt: $PROMPT"
echo ""
for i in $(seq 1 "$num_runs"); do
echo "========================================"
echo " Run $i / $num_runs"
echo "========================================"
RUN_DIR="$PARENT_DIR/run-$i"
mkdir -p "$RUN_DIR"
cp "$MANIFEST" "$RUN_DIR/manifest.json"
run_single "$eval_name" "$mode" "$MANIFEST" "$PROMPT" "$VERSION"
# Score this run immediately
"$EVAL_DIR/score-eval.sh" "$RUN_DIR"
echo ""
done
# Aggregate across runs
echo "========================================"
echo " Aggregating $num_runs runs"
echo "========================================"
python3 "$EVAL_DIR/score.py" aggregate "$PARENT_DIR"
echo "=== Results ==="
echo "Directory: $PARENT_DIR"
echo "Runs: $num_runs"
echo ""
echo "Files:"
ls -1 "$PARENT_DIR/"
else
# Single-run mode (original behavior)
RUN_DIR="$RESULTS_DIR/${eval_name}-${TIMESTAMP}"
mkdir -p "$RUN_DIR"
cp "$MANIFEST" "$RUN_DIR/manifest.json"
echo "Eval: $eval_name (v$VERSION)"
echo "Results: $RUN_DIR"
echo "Prompt: $PROMPT"
echo ""
run_single "$eval_name" "$mode" "$MANIFEST" "$PROMPT" "$VERSION"
echo "=== Results ==="
echo "Directory: $RUN_DIR"
echo ""
if [ -f "$RUN_DIR/skill.duration" ] && [ -f "$RUN_DIR/baseline.duration" ]; then
skill_dur=$(cat "$RUN_DIR/skill.duration")
baseline_dur=$(cat "$RUN_DIR/baseline.duration")
echo "With-skill duration: ${skill_dur}s"
echo "Baseline duration: ${baseline_dur}s"
fi
echo ""
echo "Files:"
ls -1 "$RUN_DIR/"
echo ""
echo "Next step: run ./score-eval.sh $RUN_DIR to score the results"
fi