321 lines
9.8 KiB
Bash
Executable file
321 lines
9.8 KiB
Bash
Executable file
#!/bin/bash
|
|
set -euo pipefail
|
|
|
|
# Eval regression checker for codeflash-agent plugin
|
|
#
|
|
# Runs evals, scores them, and compares to checked-in baselines.
|
|
# Reports per-criterion regressions so you know exactly what broke.
|
|
# Exits 1 if any score drops below the minimum threshold.
|
|
#
|
|
# Usage:
|
|
# ./check-regression.sh # run all baseline evals (3 runs each)
|
|
# ./check-regression.sh ranking # run specific template(s)
|
|
# ./check-regression.sh --runs 1 # single run (faster, less reliable)
|
|
# ./check-regression.sh --score-only <dir> # score existing results, skip running
|
|
|
|
EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
BASELINE_FILE="$EVAL_DIR/baseline-scores.json"
|
|
|
|
die() { echo "ERROR: $*" >&2; exit 1; }
|
|
|
|
[ -f "$BASELINE_FILE" ] || die "Baseline file not found: $BASELINE_FILE"
|
|
|
|
# --- Parse args ---
|
|
|
|
SCORE_ONLY=""
|
|
RESULTS_DIRS=()
|
|
TEMPLATES=()
|
|
NUM_RUNS=3
|
|
|
|
while [[ $# -gt 0 ]]; do
|
|
case "$1" in
|
|
--score-only)
|
|
SCORE_ONLY=1
|
|
shift
|
|
while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
|
|
RESULTS_DIRS+=("$1")
|
|
shift
|
|
done
|
|
;;
|
|
--runs)
|
|
NUM_RUNS="${2:?--runs requires a number}"
|
|
shift 2
|
|
;;
|
|
*)
|
|
TEMPLATES+=("$1")
|
|
shift
|
|
;;
|
|
esac
|
|
done
|
|
|
|
# If no templates specified, use all from baseline file
|
|
if [[ ${#TEMPLATES[@]} -eq 0 && -z "$SCORE_ONLY" ]]; then
|
|
mapfile -t TEMPLATES < <(jq -r '.evals | keys[]' "$BASELINE_FILE")
|
|
fi
|
|
|
|
# --- Score-only mode ---
|
|
|
|
if [[ -n "$SCORE_ONLY" ]]; then
|
|
[[ ${#RESULTS_DIRS[@]} -gt 0 ]] || die "Usage: $0 --score-only <results-dir> [<results-dir> ...]"
|
|
|
|
for dir in "${RESULTS_DIRS[@]}"; do
|
|
[ -d "$dir" ] || die "Results directory not found: $dir"
|
|
echo "Scoring: $dir"
|
|
|
|
# Check if this is a multi-run parent dir
|
|
if ls "$dir"/run-*/ >/dev/null 2>&1; then
|
|
for run_dir in "$dir"/run-*/; do
|
|
"$EVAL_DIR/score-eval.sh" "$run_dir"
|
|
done
|
|
python3 "$EVAL_DIR/score.py" aggregate "$dir"
|
|
else
|
|
"$EVAL_DIR/score-eval.sh" "$dir"
|
|
fi
|
|
done
|
|
|
|
echo ""
|
|
echo "Scored ${#RESULTS_DIRS[@]} result(s). Compare manually or re-run without --score-only."
|
|
exit 0
|
|
fi
|
|
|
|
# --- Run evals ---
|
|
|
|
echo "=== Eval Regression Check ==="
|
|
echo "Templates: ${TEMPLATES[*]}"
|
|
echo "Runs per eval: $NUM_RUNS"
|
|
echo "Baseline: $BASELINE_FILE"
|
|
echo ""
|
|
|
|
declare -A RESULT_DIRS
|
|
|
|
for template in "${TEMPLATES[@]}"; do
|
|
# Verify template exists in baseline
|
|
min=$(jq -r --arg t "$template" '.evals[$t].min // empty' "$BASELINE_FILE")
|
|
[[ -n "$min" ]] || die "Template '$template' not found in baseline file"
|
|
|
|
echo "--- Running: $template ---"
|
|
run_args=("$template" --skill-only)
|
|
if [[ "$NUM_RUNS" -gt 1 ]]; then
|
|
run_args+=(--runs "$NUM_RUNS")
|
|
fi
|
|
output=$("$EVAL_DIR/run-eval.sh" "${run_args[@]}" 2>&1)
|
|
echo "$output"
|
|
|
|
# Extract the results directory from run-eval output
|
|
result_dir=$(echo "$output" | grep "^Directory:" | head -1 | awk '{print $2}')
|
|
# Fallback to "Results:" prefix
|
|
if [[ -z "$result_dir" || ! -d "$result_dir" ]]; then
|
|
result_dir=$(echo "$output" | grep "^Results:" | head -1 | awk '{print $2}')
|
|
fi
|
|
[[ -n "$result_dir" && -d "$result_dir" ]] || die "Could not find results directory for $template"
|
|
RESULT_DIRS[$template]="$result_dir"
|
|
echo ""
|
|
done
|
|
|
|
# --- Score (single-run only, multi-run scores during run-eval.sh) ---
|
|
|
|
if [[ "$NUM_RUNS" -eq 1 ]]; then
|
|
echo "=== Scoring ==="
|
|
echo ""
|
|
|
|
for template in "${TEMPLATES[@]}"; do
|
|
result_dir="${RESULT_DIRS[$template]}"
|
|
echo "--- Scoring: $template ---"
|
|
"$EVAL_DIR/score-eval.sh" "$result_dir"
|
|
echo ""
|
|
done
|
|
fi
|
|
|
|
# --- Read scores ---
|
|
|
|
declare -A SCORES
|
|
|
|
read_scores() {
|
|
local template=$1
|
|
local result_dir="${RESULT_DIRS[$template]}"
|
|
|
|
if [[ "$NUM_RUNS" -gt 1 ]]; then
|
|
# Multi-run: read from aggregate
|
|
local agg_file="$result_dir/skill.aggregate.json"
|
|
if [[ -f "$agg_file" ]]; then
|
|
SCORES[$template]=$(jq -r '.total.avg | floor' "$agg_file")
|
|
return
|
|
fi
|
|
fi
|
|
|
|
# Single-run: read from score file
|
|
local score_file="$result_dir/skill.score.json"
|
|
if [[ -f "$score_file" ]]; then
|
|
SCORES[$template]=$(jq -r '.total' "$score_file")
|
|
else
|
|
echo "WARNING: No score file for $template"
|
|
SCORES[$template]="0"
|
|
fi
|
|
}
|
|
|
|
for template in "${TEMPLATES[@]}"; do
|
|
read_scores "$template"
|
|
done
|
|
|
|
# --- Compare to baseline (totals) ---
|
|
|
|
echo "=== Regression Check ==="
|
|
echo ""
|
|
printf "%-25s %8s %8s %8s %10s\n" "Template" "Score" "Min" "Expected" "Status"
|
|
printf "%-25s %8s %8s %8s %10s\n" "--------" "-----" "---" "--------" "------"
|
|
|
|
FAILED=0
|
|
|
|
for template in "${TEMPLATES[@]}"; do
|
|
score="${SCORES[$template]}"
|
|
min=$(jq -r --arg t "$template" '.evals[$t].min' "$BASELINE_FILE")
|
|
expected=$(jq -r --arg t "$template" '.evals[$t].expected' "$BASELINE_FILE")
|
|
max=$(jq -r --arg t "$template" '.evals[$t].max' "$BASELINE_FILE")
|
|
|
|
if [[ "$score" -lt "$min" ]]; then
|
|
status="FAIL"
|
|
FAILED=1
|
|
elif [[ "$score" -lt "$expected" ]]; then
|
|
status="WARN"
|
|
else
|
|
status="PASS"
|
|
fi
|
|
|
|
printf "%-25s %8s %8s %8s %10s\n" "$template" "$score/$max" "$min" "$expected" "$status"
|
|
done
|
|
|
|
echo ""
|
|
|
|
# --- Per-criterion regression check ---
|
|
|
|
echo "=== Per-Criterion Breakdown ==="
|
|
echo ""
|
|
|
|
CRITERION_FAILURES=0
|
|
|
|
for template in "${TEMPLATES[@]}"; do
|
|
result_dir="${RESULT_DIRS[$template]}"
|
|
|
|
# Check if baseline has per-criterion data
|
|
has_criteria=$(jq -r --arg t "$template" '.evals[$t].criteria // empty' "$BASELINE_FILE")
|
|
[[ -n "$has_criteria" ]] || continue
|
|
|
|
# Read actual criterion scores
|
|
local_criteria=""
|
|
if [[ "$NUM_RUNS" -gt 1 ]]; then
|
|
agg_file="$result_dir/skill.aggregate.json"
|
|
[[ -f "$agg_file" ]] && local_criteria=$(jq -r '.criteria' "$agg_file")
|
|
else
|
|
score_file="$result_dir/skill.score.json"
|
|
[[ -f "$score_file" ]] && local_criteria=$(jq -r '.criteria' "$score_file")
|
|
fi
|
|
[[ -n "$local_criteria" ]] || continue
|
|
|
|
echo "--- $template ---"
|
|
|
|
# Get criterion names from baseline
|
|
criteria_names=$(jq -r --arg t "$template" '.evals[$t].criteria | keys[]' "$BASELINE_FILE")
|
|
|
|
for crit in $criteria_names; do
|
|
crit_expected=$(jq -r --arg t "$template" --arg c "$crit" '.evals[$t].criteria[$c].expected' "$BASELINE_FILE")
|
|
crit_min=$(jq -r --arg t "$template" --arg c "$crit" '.evals[$t].criteria[$c].min' "$BASELINE_FILE")
|
|
|
|
# Get actual score
|
|
if [[ "$NUM_RUNS" -gt 1 ]]; then
|
|
actual=$(jq -r --arg c "$crit" '.criteria[$c].avg // 0' "$agg_file")
|
|
stddev=$(jq -r --arg c "$crit" '.criteria[$c].stddev // 0' "$agg_file")
|
|
actual_int=$(echo "$actual" | awk '{printf "%d", $1}')
|
|
score_display="${actual} (stddev=${stddev})"
|
|
else
|
|
actual=$(echo "$local_criteria" | jq -r --arg c "$crit" '.[$c] // 0')
|
|
actual_int="$actual"
|
|
score_display="$actual"
|
|
fi
|
|
|
|
if [[ "$actual_int" -lt "$crit_min" ]]; then
|
|
status="FAIL"
|
|
CRITERION_FAILURES=$((CRITERION_FAILURES + 1))
|
|
elif [[ "$actual_int" -lt "$crit_expected" ]]; then
|
|
status="WARN"
|
|
else
|
|
status="PASS"
|
|
fi
|
|
|
|
printf " %-40s %12s expected=%-3s min=%-3s %s\n" "$crit" "$score_display" "$crit_expected" "$crit_min" "$status"
|
|
done
|
|
echo ""
|
|
done
|
|
|
|
# --- Flaky criteria report (multi-run only) ---
|
|
|
|
if [[ "$NUM_RUNS" -gt 1 ]]; then
|
|
echo "=== Variance Report ==="
|
|
echo ""
|
|
any_flaky=0
|
|
|
|
for template in "${TEMPLATES[@]}"; do
|
|
result_dir="${RESULT_DIRS[$template]}"
|
|
agg_file="$result_dir/skill.aggregate.json"
|
|
[[ -f "$agg_file" ]] || continue
|
|
|
|
flaky=$(jq -r '.flaky_criteria // [] | .[]' "$agg_file" 2>/dev/null)
|
|
if [[ -n "$flaky" ]]; then
|
|
any_flaky=1
|
|
echo " $template:"
|
|
for crit in $flaky; do
|
|
scores=$(jq -r --arg c "$crit" '.criteria[$c].scores | map(tostring) | join(", ")' "$agg_file")
|
|
stddev=$(jq -r --arg c "$crit" '.criteria[$c].stddev' "$agg_file")
|
|
echo " $crit: [$scores] stddev=$stddev"
|
|
done
|
|
fi
|
|
done
|
|
|
|
if [[ "$any_flaky" -eq 0 ]]; then
|
|
echo " No flaky criteria detected across $NUM_RUNS runs."
|
|
fi
|
|
echo ""
|
|
fi
|
|
|
|
# --- Write summary for CI ---
|
|
|
|
SUMMARY_FILE="$EVAL_DIR/results/regression-summary.json"
|
|
mkdir -p "$(dirname "$SUMMARY_FILE")"
|
|
|
|
{
|
|
echo "{"
|
|
echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
|
|
echo " \"runs_per_eval\": $NUM_RUNS,"
|
|
echo " \"passed\": $([ $FAILED -eq 0 ] && [ $CRITERION_FAILURES -eq 0 ] && echo true || echo false),"
|
|
echo " \"total_regressions\": $FAILED,"
|
|
echo " \"criterion_regressions\": $CRITERION_FAILURES,"
|
|
echo " \"results\": {"
|
|
first=1
|
|
for template in "${TEMPLATES[@]}"; do
|
|
[ $first -eq 0 ] && echo ","
|
|
first=0
|
|
score="${SCORES[$template]}"
|
|
min=$(jq -r --arg t "$template" '.evals[$t].min' "$BASELINE_FILE")
|
|
expected=$(jq -r --arg t "$template" '.evals[$t].expected' "$BASELINE_FILE")
|
|
printf ' "%s": { "score": %s, "min": %s, "expected": %s }' "$template" "$score" "$min" "$expected"
|
|
done
|
|
echo ""
|
|
echo " }"
|
|
echo "}"
|
|
} > "$SUMMARY_FILE"
|
|
|
|
echo "Summary: $SUMMARY_FILE"
|
|
|
|
if [[ $FAILED -eq 1 || $CRITERION_FAILURES -gt 0 ]]; then
|
|
echo ""
|
|
if [[ $FAILED -eq 1 ]]; then
|
|
echo "REGRESSION DETECTED: Total score below minimum threshold."
|
|
fi
|
|
if [[ $CRITERION_FAILURES -gt 0 ]]; then
|
|
echo "CRITERION REGRESSION: $CRITERION_FAILURES criterion(s) below minimum threshold."
|
|
fi
|
|
exit 1
|
|
fi
|
|
|
|
echo ""
|
|
echo "All evals passed regression check."
|
|
exit 0
|