- Optimize codeflash-optimize SKILL.md (review score 17% → 98%, eval 87% → 100%)
- Fix frontmatter (allowed-tools format, argument-hint under metadata)
- Lead description with concrete actions, explicit agent launch parameters
- Add multi-run variance detection to eval system (--runs N flag)
- score.py aggregate command: min/max/avg/stddev per criterion, flaky detection
- check-regression.sh defaults to 3 runs for reliable regression detection
- Add per-criterion regression tracking to baseline-scores.json (v3)
- Reports exactly which criteria regressed, not just total score drops
- Rename evals/ → codeflash-evals/ to avoid tessl directory conflicts
- Switch tessl to managed mode, gitignore vendored tiles and symlinks
337 lines
12 KiB
Bash
Executable file
#!/bin/bash
set -euo pipefail

# Eval runner for codeflash-agent plugin
# Usage:
#   ./run-eval.sh <template>                  # run both skill + baseline
#   ./run-eval.sh <template> --skill-only     # run with-skill only
#   ./run-eval.sh <template> --baseline-only  # run baseline only
#   ./run-eval.sh <template> --runs 3         # run 3 times, aggregate
#   ./run-eval.sh --list                      # list available templates
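
# Manifest fields this script reads (illustrative sketch only; the values below
# are invented, and real manifests may carry extra fields used by score-eval.sh
# and score.py):
#
#   v1 (templates/<name>/manifest.json):
#     { "version": 1, "description": "...", "eval_type": "layered",
#       "test_command": "uv run pytest -q" }
#
#   v2 (repos/<name>/manifest.json):
#     { "version": 2, "description": "...", "repo": "owner/name",
#       "commit": "<full sha>", "prompt": "...", "test_command": "..." }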

EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
TEMPLATES_DIR="$EVAL_DIR/templates"
REPOS_DIR="$EVAL_DIR/repos"
RESULTS_DIR="$EVAL_DIR/results"
REPO_ROOT="$(cd "$EVAL_DIR/.." && pwd)"
PLUGIN_DIR="$REPO_ROOT"

# --- Helpers ---

die() { echo "ERROR: $*" >&2; exit 1; }

# Find eval directory (templates/ or repos/)
find_eval_dir() {
  local name=$1
  if [ -d "$TEMPLATES_DIR/$name" ]; then
    echo "$TEMPLATES_DIR/$name"
  elif [ -d "$REPOS_DIR/$name" ]; then
    echo "$REPOS_DIR/$name"
  else
    echo ""
  fi
}

list_templates() {
  echo "Available evals:"
  echo ""
  echo " Templates (v1 — bundled source):"
  for d in "$TEMPLATES_DIR"/*/; do
    [ -d "$d" ] || continue
    name=$(basename "$d")
    desc=$(jq -r '.description // "no description"' "$d/manifest.json" 2>/dev/null || echo "no manifest")
    printf " %-30s %s\n" "$name" "$desc"
  done
  echo ""
  echo " Repos (v2 — cloned from git):"
  if [ -d "$REPOS_DIR" ]; then
    for d in "$REPOS_DIR"/*/; do
      [ -d "$d" ] || continue
      name=$(basename "$d")
      repo=$(jq -r '.repo // "?"' "$d/manifest.json" 2>/dev/null)
      desc=$(jq -r '.description // "no description"' "$d/manifest.json" 2>/dev/null || echo "no manifest")
      printf " %-30s %s (%s)\n" "$name" "$desc" "$repo"
    done
  else
    echo " (none)"
  fi
}

setup_workspace_v1() {
  local template=$1 label=$2
  local workdir
  workdir=$(mktemp -d)/eval-${label}-${template}
  cp -r "$TEMPLATES_DIR/$template" "$workdir"

  # Remove any leftover .venv / cache
  rm -rf "$workdir/.venv" "$workdir/.pytest_cache" "$workdir/__pycache__"

  # Install deps
  (cd "$workdir" && uv sync --quiet 2>/dev/null)

  echo "$workdir"
}

setup_workspace_v2() {
  local eval_name=$1 label=$2 manifest=$3
  local repo commit cache_dir workdir
  repo=$(jq -r '.repo' "$manifest")
  commit=$(jq -r '.commit' "$manifest")
  cache_dir="$(dirname "$manifest")/workspace"
  workdir=$(mktemp -d)/eval-${label}-${eval_name}

  # Clone to cache dir if not already present
  if [ ! -d "$cache_dir/.git" ]; then
    echo "Cloning $repo at $commit (first run — caching to $cache_dir)..." >&2
    gh repo clone "$repo" "$cache_dir" -- --quiet --no-checkout --depth 1
    (cd "$cache_dir" \
      && git fetch --quiet --depth 1 origin "$commit" \
      && git checkout --quiet FETCH_HEAD)
  else
    echo "Using cached clone from $cache_dir" >&2
  fi
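
  # Note: the cached clone persists at "$cache_dir" (the workspace/ dir next to
  # the manifest) across eval runs; delete it to force a fresh clone. If the
  # manifest's commit changes, the cache is not refreshed automatically.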

  # Copy cached workspace to isolated temp dir for this run
  cp -r "$cache_dir" "$workdir"

  echo "$workdir"
}

build_prompt() {
  local manifest=$1
  local version
  version=$(jq -r '.version // 1' "$manifest")

  # v2: prompt is in the manifest
  if [ "$version" -ge 2 ]; then
    jq -r '.prompt' "$manifest"
    return
  fi

  # v1: derive prompt from eval_type
  local eval_type
  eval_type=$(jq -r '.eval_type' "$manifest")

  case "$eval_type" in
    crossdomain)
      echo "The tests in this project are too slow. Profile the code, identify the performance bottlenecks in each module, and optimize them. Run the tests before and after to verify correctness and measure speedup."
      ;;
    layered)
      echo "test_process_large_batch is too slow and uses too much memory. Profile and optimize process_records. Run the test before and after to verify correctness and measure improvement."
      ;;
    ranking|ranking-hard)
      echo "test_large_batch in test_pipeline.py is too slow. Profile run_pipeline, find what's slow, and optimize. Run the test before and after to verify correctness and measure speedup."
      ;;
    memory)
      echo "process_dataset in test_large_dataset uses too much memory — peak RSS is 3x higher than the input data size. Profile memory usage, find what's allocating the most, and reduce peak memory. Run the test before and after to verify correctness."
      ;;
    memory-hard)
      echo "process_readings in test_large_dataset uses too much memory — peak is 3x higher than the input data. Profile memory usage step by step, find the dominant allocator, and reduce peak memory. Run the test before and after to verify correctness."
      ;;
    memory-balanced)
      echo "process_orders in test_large_batch uses too much memory — peak is over 2x the input data. Profile memory at each pipeline stage, find what's adding overhead, and reduce it. Run the test before and after to verify correctness."
      ;;
    memory-misdirection)
      echo "process_transactions in test_large_batch uses too much memory — peak is nearly 3x the input data. Profile memory usage, identify the biggest allocators in order of impact, and fix them. Run the test before and after to verify correctness."
      ;;
    *)
      echo "The tests in this project are too slow. Profile the code, find the performance bottlenecks, and optimize them. Run the tests before and after to verify correctness and measure improvement."
      ;;
  esac
}

run_claude() {
  local workdir=$1 label=$2 result_prefix=$3 use_plugin=$4
  local start_time end_time duration

  echo "=== Running $label in $workdir ==="

  start_time=$(date +%s)

  local prompt
  prompt=$(cat "$workdir/.eval-prompt")

  # When running with plugin, instruct Claude to invoke the skill
  if [ "$use_plugin" = "true" ]; then
    prompt="IMPORTANT: You MUST use the Skill tool to invoke '/codeflash-optimize' for this task. Do NOT solve the problem directly — delegate to the skill first. This is a non-interactive eval — there is no user to respond. When the skill tells you to ask the user a question, DO NOT ask and DO NOT stop. Instead, immediately launch the codeflash agent with the task below. No constraints apply. Task: $prompt"
  fi

  local claude_args=(-p "$prompt" --output-format json --verbose)

  if [ "$use_plugin" = "true" ]; then
    claude_args+=(--plugin-dir "$PLUGIN_DIR")
  fi

  # Run claude in the workspace dir
  # --dangerously-skip-permissions: evals run in temp dirs, safe to allow all tools
  (cd "$workdir" && claude "${claude_args[@]}" --dangerously-skip-permissions) \
    > "${result_prefix}.json" 2> "${result_prefix}.stderr" || true

  end_time=$(date +%s)
  duration=$((end_time - start_time))

  echo "$label completed in ${duration}s"
  echo "$duration" > "${result_prefix}.duration"

  # Run tests post-optimization to check correctness + timing
  local test_cmd
  test_cmd=$(jq -r '.test_command // empty' "$RUN_DIR/manifest.json")
  if [ -n "$test_cmd" ]; then
    echo "Running post-optimization tests..."
    (cd "$workdir" && eval "$test_cmd") > "${result_prefix}.tests" 2>&1 || true
  else
    echo "No test_command in manifest — skipping post-optimization tests"
    touch "${result_prefix}.tests"
  fi
}
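
# Each run_claude call leaves four artifacts next to its result prefix:
#   <prefix>.json (claude CLI output), <prefix>.stderr, <prefix>.duration,
#   <prefix>.tests (post-optimization test output).
# score-eval.sh is expected to read these when scoring a run (see below).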

run_single() {
  # Run a single eval iteration into $RUN_DIR
  local eval_name=$1 mode=$2 manifest=$3 prompt=$4 version=$5

  echo "$PROMPT" > /dev/null # ensure PROMPT is available

  # --- With-skill run ---
  if [ "$mode" != "--baseline-only" ]; then
    local skill_workdir
    if [ "$version" -ge 2 ]; then
      skill_workdir=$(setup_workspace_v2 "$eval_name" "skill" "$manifest")
    else
      skill_workdir=$(setup_workspace_v1 "$eval_name" "skill")
    fi
    echo "$prompt" > "$skill_workdir/.eval-prompt"
    run_claude "$skill_workdir" "with-skill" "$RUN_DIR/skill" "true"
    echo ""
  fi

  # --- Baseline run ---
  if [ "$mode" != "--skill-only" ]; then
    local baseline_workdir
    if [ "$version" -ge 2 ]; then
      baseline_workdir=$(setup_workspace_v2 "$eval_name" "baseline" "$manifest")
    else
      baseline_workdir=$(setup_workspace_v1 "$eval_name" "baseline")
    fi
    echo "$prompt" > "$baseline_workdir/.eval-prompt"
    run_claude "$baseline_workdir" "baseline" "$RUN_DIR/baseline" "false"
    echo ""
  fi
}

# --- Main ---

if [ "${1:-}" = "--list" ]; then
  list_templates
  exit 0
fi

# --- Parse args ---

eval_name=""
mode="--both"
num_runs=1

while [[ $# -gt 0 ]]; do
  case "$1" in
    --skill-only|--baseline-only)
      mode="$1"
      shift
      ;;
    --runs)
      num_runs="${2:?--runs requires a number}"
      shift 2
      ;;
    -*)
      die "Unknown flag: $1"
      ;;
    *)
      eval_name="$1"
      shift
      ;;
  esac
done

[[ -n "$eval_name" ]] || die "Usage: $0 <eval-name> [--skill-only|--baseline-only] [--runs N]"

EVAL_SOURCE_DIR=$(find_eval_dir "$eval_name")
[ -n "$EVAL_SOURCE_DIR" ] || die "Eval not found: $eval_name. Use --list to see available evals."

MANIFEST="$EVAL_SOURCE_DIR/manifest.json"
[ -f "$MANIFEST" ] || die "No manifest.json in: $EVAL_SOURCE_DIR"

VERSION=$(jq -r '.version // 1' "$MANIFEST")

TIMESTAMP=$(date +%Y%m%d-%H%M%S)

# Build prompt
PROMPT=$(build_prompt "$MANIFEST")

if [ "$num_runs" -gt 1 ]; then
  # Multi-run mode: create parent dir with run-1/, run-2/, etc.
  PARENT_DIR="$RESULTS_DIR/${eval_name}-${TIMESTAMP}-${num_runs}runs"
  mkdir -p "$PARENT_DIR"
  cp "$MANIFEST" "$PARENT_DIR/manifest.json"

  echo "Eval: $eval_name (v$VERSION) — $num_runs runs"
  echo "Results: $PARENT_DIR"
  echo "Prompt: $PROMPT"
  echo ""

  for i in $(seq 1 "$num_runs"); do
    echo "========================================"
    echo " Run $i / $num_runs"
    echo "========================================"

    RUN_DIR="$PARENT_DIR/run-$i"
    mkdir -p "$RUN_DIR"
    cp "$MANIFEST" "$RUN_DIR/manifest.json"

    run_single "$eval_name" "$mode" "$MANIFEST" "$PROMPT" "$VERSION"

    # Score this run immediately
    "$EVAL_DIR/score-eval.sh" "$RUN_DIR"
    echo ""
  done

  # Aggregate across runs
  echo "========================================"
  echo " Aggregating $num_runs runs"
  echo "========================================"
  python3 "$EVAL_DIR/score.py" aggregate "$PARENT_DIR"

  echo "=== Results ==="
  echo "Directory: $PARENT_DIR"
  echo "Runs: $num_runs"
  echo ""
  echo "Files:"
  ls -1 "$PARENT_DIR/"
else
  # Single-run mode (original behavior)
  RUN_DIR="$RESULTS_DIR/${eval_name}-${TIMESTAMP}"
  mkdir -p "$RUN_DIR"
  cp "$MANIFEST" "$RUN_DIR/manifest.json"

  echo "Eval: $eval_name (v$VERSION)"
  echo "Results: $RUN_DIR"
  echo "Prompt: $PROMPT"
  echo ""

  run_single "$eval_name" "$mode" "$MANIFEST" "$PROMPT" "$VERSION"

  echo "=== Results ==="
  echo "Directory: $RUN_DIR"
  echo ""

  if [ -f "$RUN_DIR/skill.duration" ] && [ -f "$RUN_DIR/baseline.duration" ]; then
    skill_dur=$(cat "$RUN_DIR/skill.duration")
    baseline_dur=$(cat "$RUN_DIR/baseline.duration")
    echo "With-skill duration: ${skill_dur}s"
    echo "Baseline duration: ${baseline_dur}s"
  fi

  echo ""
  echo "Files:"
  ls -1 "$RUN_DIR/"
  echo ""
  echo "Next step: run ./score-eval.sh $RUN_DIR to score the results"
fi