#!/bin/bash
set -euo pipefail
# Eval regression checker for codeflash-agent plugin
#
# Runs evals, scores them, and compares to checked-in baselines.
# Reports per-criterion regressions so you know exactly what broke.
# Exits 1 if any score drops below the minimum threshold.
#
# Usage:
#   ./check-regression.sh                # run all baseline evals (3 runs each)
#   ./check-regression.sh ranking        # run specific template(s)
#   ./check-regression.sh --runs 1       # single run (faster, less reliable)
#   ./check-regression.sh --score-only   # score existing results, skip running

EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
BASELINE_FILE="$EVAL_DIR/baseline-scores.json"

die() { echo "ERROR: $*" >&2; exit 1; }

[ -f "$BASELINE_FILE" ] || die "Baseline file not found: $BASELINE_FILE"

# --- Parse args ---
SCORE_ONLY=""
RESULTS_DIRS=()
TEMPLATES=()
NUM_RUNS=3

while [[ $# -gt 0 ]]; do
  case "$1" in
    --score-only)
      SCORE_ONLY=1
      shift
      # Consume every following non-flag argument as a results directory.
      while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
        RESULTS_DIRS+=("$1")
        shift
      done
      ;;
    --runs)
      NUM_RUNS="${2:?--runs requires a number}"
      shift 2
      ;;
    *)
      TEMPLATES+=("$1")
      shift
      ;;
  esac
done

# If no templates specified, use all from baseline file
if [[ ${#TEMPLATES[@]} -eq 0 && -z "$SCORE_ONLY" ]]; then
  mapfile -t TEMPLATES < <(jq -r '.evals | keys[]' "$BASELINE_FILE")
fi

# --- Score-only mode ---
if [[ -n "$SCORE_ONLY" ]]; then
  [[ ${#RESULTS_DIRS[@]} -gt 0 ]] || die "Usage: $0 --score-only <results-dir> [<results-dir> ...]"
  for dir in "${RESULTS_DIRS[@]}"; do
    [ -d "$dir" ] || die "Results directory not found: $dir"
    echo "Scoring: $dir"
    # Check if this is a multi-run parent dir
    if ls "$dir"/run-*/ >/dev/null 2>&1; then
      for run_dir in "$dir"/run-*/; do
        "$EVAL_DIR/score-eval.sh" "$run_dir"
      done
      python3 "$EVAL_DIR/score.py" aggregate "$dir"
    else
      "$EVAL_DIR/score-eval.sh" "$dir"
    fi
  done
  echo ""
  echo "Scored ${#RESULTS_DIRS[@]} result(s). Compare manually or re-run without --score-only."
  exit 0
fi

# --- Run evals ---
echo "=== Eval Regression Check ==="
echo "Templates: ${TEMPLATES[*]}"
echo "Runs per eval: $NUM_RUNS"
echo "Baseline: $BASELINE_FILE"
echo ""

declare -A RESULT_DIRS

for template in "${TEMPLATES[@]}"; do
  # Verify template exists in baseline
  min=$(jq -r --arg t "$template" '.evals[$t].min // empty' "$BASELINE_FILE")
  [[ -n "$min" ]] || die "Template '$template' not found in baseline file"

  echo "--- Running: $template ---"
  run_args=("$template" --skill-only)
  if [[ "$NUM_RUNS" -gt 1 ]]; then
    run_args+=(--runs "$NUM_RUNS")
  fi

  output=$("$EVAL_DIR/run-eval.sh" "${run_args[@]}" 2>&1)
  echo "$output"

  # Extract the results directory from run-eval output.
  # NB: the '|| true' is required — under 'set -e -o pipefail' a no-match
  # grep (exit 1) would otherwise abort the whole script before the
  # "Results:" fallback below ever gets a chance to run.
  result_dir=$(echo "$output" | grep "^Directory:" | head -1 | awk '{print $2}' || true)
  # Fallback to "Results:" prefix
  if [[ -z "$result_dir" || ! -d "$result_dir" ]]; then
    result_dir=$(echo "$output" | grep "^Results:" | head -1 | awk '{print $2}' || true)
  fi
  [[ -n "$result_dir" && -d "$result_dir" ]] || die "Could not find results directory for $template"

  RESULT_DIRS[$template]="$result_dir"
  echo ""
done

# --- Score (single-run only, multi-run scores during run-eval.sh) ---
if [[ "$NUM_RUNS" -eq 1 ]]; then
  echo "=== Scoring ==="
  echo ""
  for template in "${TEMPLATES[@]}"; do
    result_dir="${RESULT_DIRS[$template]}"
    echo "--- Scoring: $template ---"
    "$EVAL_DIR/score-eval.sh" "$result_dir"
    echo ""
  done
fi

# --- Read scores ---
declare -A SCORES

#######################################
# Load the total score for one template into SCORES.
# Globals:   RESULT_DIRS (read), NUM_RUNS (read), SCORES (written)
# Arguments: $1 - template name
# Outputs:   warning to stdout if no score file exists (score defaults to 0)
#######################################
read_scores() {
  local template=$1
  local result_dir="${RESULT_DIRS[$template]}"
  if [[ "$NUM_RUNS" -gt 1 ]]; then
    # Multi-run: read from aggregate
    local agg_file="$result_dir/skill.aggregate.json"
    if [[ -f "$agg_file" ]]; then
      # floor() so the average compares cleanly as an integer below
      SCORES[$template]=$(jq -r '.total.avg | floor' "$agg_file")
      return
    fi
  fi
  # Single-run: read from score file
  local score_file="$result_dir/skill.score.json"
  if [[ -f "$score_file" ]]; then
    SCORES[$template]=$(jq -r '.total' "$score_file")
  else
    echo "WARNING: No score file for $template"
    SCORES[$template]="0"
  fi
}

for template in "${TEMPLATES[@]}"; do
  read_scores "$template"
done

# --- Compare to baseline (totals) ---
echo "=== Regression Check ==="
echo ""
printf "%-25s %8s %8s %8s %10s\n" "Template" "Score" "Min" "Expected" "Status"
printf "%-25s %8s %8s %8s %10s\n" "--------" "-----" "---" "--------" "------"

FAILED=0
for template in "${TEMPLATES[@]}"; do
  score="${SCORES[$template]}"
  min=$(jq -r --arg t "$template" '.evals[$t].min' "$BASELINE_FILE")
  expected=$(jq -r --arg t "$template" '.evals[$t].expected' "$BASELINE_FILE")
  max=$(jq -r --arg t "$template" '.evals[$t].max' "$BASELINE_FILE")

  # FAIL below the hard floor, WARN below the expected score, else PASS.
  if [[ "$score" -lt "$min" ]]; then
    status="FAIL"
    FAILED=1
  elif [[ "$score" -lt "$expected" ]]; then
    status="WARN"
  else
    status="PASS"
  fi
  printf "%-25s %8s %8s %8s %10s\n" "$template" "$score/$max" "$min" "$expected" "$status"
done
echo ""

# --- Per-criterion regression check ---
echo "=== Per-Criterion Breakdown ==="
echo ""

CRITERION_FAILURES=0
for template in "${TEMPLATES[@]}"; do
  result_dir="${RESULT_DIRS[$template]}"

  # Check if baseline has per-criterion data
  has_criteria=$(jq -r --arg t "$template" '.evals[$t].criteria // empty' "$BASELINE_FILE")
  [[ -n "$has_criteria" ]] || continue

  # Read actual criterion scores
  local_criteria=""
  if [[ "$NUM_RUNS" -gt 1 ]]; then
    agg_file="$result_dir/skill.aggregate.json"
    [[ -f "$agg_file" ]] && local_criteria=$(jq -r '.criteria' "$agg_file")
  else
    score_file="$result_dir/skill.score.json"
    [[ -f "$score_file" ]] && local_criteria=$(jq -r '.criteria' "$score_file")
  fi
  [[ -n "$local_criteria" ]] || continue

  echo "--- $template ---"

  # Get criterion names from baseline
  criteria_names=$(jq -r --arg t "$template" '.evals[$t].criteria | keys[]' "$BASELINE_FILE")
  for crit in $criteria_names; do
    crit_expected=$(jq -r --arg t "$template" --arg c "$crit" '.evals[$t].criteria[$c].expected' "$BASELINE_FILE")
    crit_min=$(jq -r --arg t "$template" --arg c "$crit" '.evals[$t].criteria[$c].min' "$BASELINE_FILE")

    # Get actual score
    if [[ "$NUM_RUNS" -gt 1 ]]; then
      actual=$(jq -r --arg c "$crit" '.criteria[$c].avg // 0' "$agg_file")
      stddev=$(jq -r --arg c "$crit" '.criteria[$c].stddev // 0' "$agg_file")
      # Truncate the float average for the integer threshold comparison.
      actual_int=$(echo "$actual" | awk '{printf "%d", $1}')
      score_display="${actual} (stddev=${stddev})"
    else
      actual=$(echo "$local_criteria" | jq -r --arg c "$crit" '.[$c] // 0')
      actual_int="$actual"
      score_display="$actual"
    fi

    if [[ "$actual_int" -lt "$crit_min" ]]; then
      status="FAIL"
      CRITERION_FAILURES=$((CRITERION_FAILURES + 1))
    elif [[ "$actual_int" -lt "$crit_expected" ]]; then
      status="WARN"
    else
      status="PASS"
    fi
    printf " %-40s %12s expected=%-3s min=%-3s %s\n" "$crit" "$score_display" "$crit_expected" "$crit_min" "$status"
  done
  echo ""
done

# --- Flaky criteria report (multi-run only) ---
if [[ "$NUM_RUNS" -gt 1 ]]; then
  echo "=== Variance Report ==="
  echo ""
  any_flaky=0
  for template in "${TEMPLATES[@]}"; do
    result_dir="${RESULT_DIRS[$template]}"
    agg_file="$result_dir/skill.aggregate.json"
    [[ -f "$agg_file" ]] || continue
    # Best-effort read: '|| true' keeps a malformed aggregate file (jq
    # non-zero exit, stderr already suppressed) from aborting under set -e.
    flaky=$(jq -r '.flaky_criteria // [] | .[]' "$agg_file" 2>/dev/null || true)
    if [[ -n "$flaky" ]]; then
      any_flaky=1
      echo " $template:"
      for crit in $flaky; do
        scores=$(jq -r --arg c "$crit" '.criteria[$c].scores | map(tostring) | join(", ")' "$agg_file")
        stddev=$(jq -r --arg c "$crit" '.criteria[$c].stddev' "$agg_file")
        echo " $crit: [$scores] stddev=$stddev"
      done
    fi
  done
  if [[ "$any_flaky" -eq 0 ]]; then
    echo " No flaky criteria detected across $NUM_RUNS runs."
  fi
  echo ""
fi

# --- Write summary for CI ---
SUMMARY_FILE="$EVAL_DIR/results/regression-summary.json"
mkdir -p "$(dirname "$SUMMARY_FILE")"
{
  echo "{"
  echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
  echo " \"runs_per_eval\": $NUM_RUNS,"
  echo " \"passed\": $([ $FAILED -eq 0 ] && [ $CRITERION_FAILURES -eq 0 ] && echo true || echo false),"
  echo " \"total_regressions\": $FAILED,"
  echo " \"criterion_regressions\": $CRITERION_FAILURES,"
  echo " \"results\": {"
  first=1
  for template in "${TEMPLATES[@]}"; do
    # Comma-separate entries after the first (safe under set -e: a failing
    # test that is not the last command of an AND list does not trip errexit).
    [ $first -eq 0 ] && echo ","
    first=0
    score="${SCORES[$template]}"
    min=$(jq -r --arg t "$template" '.evals[$t].min' "$BASELINE_FILE")
    expected=$(jq -r --arg t "$template" '.evals[$t].expected' "$BASELINE_FILE")
    printf ' "%s": { "score": %s, "min": %s, "expected": %s }' "$template" "$score" "$min" "$expected"
  done
  echo ""
  echo " }"
  echo "}"
} > "$SUMMARY_FILE"

echo "Summary: $SUMMARY_FILE"

if [[ $FAILED -eq 1 || $CRITERION_FAILURES -gt 0 ]]; then
  echo ""
  if [[ $FAILED -eq 1 ]]; then
    echo "REGRESSION DETECTED: Total score below minimum threshold."
  fi
  if [[ $CRITERION_FAILURES -gt 0 ]]; then
    echo "CRITERION REGRESSION: $CRITERION_FAILURES criterion(s) below minimum threshold."
  fi
  exit 1
fi

echo ""
echo "All evals passed regression check."
exit 0