codeflash-agent/evals/check-regression.sh

#!/bin/bash
set -euo pipefail
# Eval regression checker for codeflash-agent plugin
#
# Runs evals, scores them, and compares to checked-in baselines.
# Reports per-criterion regressions so you know exactly what broke.
# Exits 1 if any score drops below the minimum threshold.
#
# Usage:
#   ./check-regression.sh                       # run all baseline evals (3 runs each)
#   ./check-regression.sh ranking               # run specific template(s)
#   ./check-regression.sh --runs 1              # single run (faster, less reliable)
#   ./check-regression.sh --score-only <dir>    # score existing results, skip running
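#   ./check-regression.sh ranking --runs 1      # combine template(s) with a run count
#   ./check-regression.sh --score-only <dir1> <dir2>   # --score-only accepts multiple dirs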
EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
BASELINE_FILE="$EVAL_DIR/baseline-scores.json"
die() { echo "ERROR: $*" >&2; exit 1; }
[ -f "$BASELINE_FILE" ] || die "Baseline file not found: $BASELINE_FILE"
# --- Parse args ---
SCORE_ONLY=""
RESULTS_DIRS=()
TEMPLATES=()
NUM_RUNS=3
while [[ $# -gt 0 ]]; do
  case "$1" in
    --score-only)
      SCORE_ONLY=1
      shift
      while [[ $# -gt 0 && ! "$1" =~ ^-- ]]; do
        RESULTS_DIRS+=("$1")
        shift
      done
      ;;
    --runs)
      NUM_RUNS="${2:?--runs requires a number}"
      shift 2
      ;;
    *)
      TEMPLATES+=("$1")
      shift
      ;;
  esac
done

# If no templates specified, use all from baseline file
if [[ ${#TEMPLATES[@]} -eq 0 && -z "$SCORE_ONLY" ]]; then
  mapfile -t TEMPLATES < <(jq -r '.evals | keys[]' "$BASELINE_FILE")
fi

# --- Score-only mode ---
if [[ -n "$SCORE_ONLY" ]]; then
[[ ${#RESULTS_DIRS[@]} -gt 0 ]] || die "Usage: $0 --score-only <results-dir> [<results-dir> ...]"
for dir in "${RESULTS_DIRS[@]}"; do
[ -d "$dir" ] || die "Results directory not found: $dir"
echo "Scoring: $dir"
# Check if this is a multi-run parent dir
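    # (Assumed layout: a multi-run parent dir contains run-1/, run-2/, ... subdirectories;
    # each run dir is scored individually, then the parent gets an aggregate via score.py.
    # Directory names here are illustrative.)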
if ls "$dir"/run-*/ >/dev/null 2>&1; then
for run_dir in "$dir"/run-*/; do
"$EVAL_DIR/score-eval.sh" "$run_dir"
done
python3 "$EVAL_DIR/score.py" aggregate "$dir"
else
"$EVAL_DIR/score-eval.sh" "$dir"
fi
done
echo ""
echo "Scored ${#RESULTS_DIRS[@]} result(s). Compare manually or re-run without --score-only."
exit 0
fi
# --- Run evals ---
echo "=== Eval Regression Check ==="
echo "Templates: ${TEMPLATES[*]}"
echo "Runs per eval: $NUM_RUNS"
echo "Baseline: $BASELINE_FILE"
echo ""
declare -A RESULT_DIRS
for template in "${TEMPLATES[@]}"; do
# Verify template exists in baseline
min=$(jq -r --arg t "$template" '.evals[$t].min // empty' "$BASELINE_FILE")
[[ -n "$min" ]] || die "Template '$template' not found in baseline file"
echo "--- Running: $template ---"
run_args=("$template" --skill-only)
if [[ "$NUM_RUNS" -gt 1 ]]; then
run_args+=(--runs "$NUM_RUNS")
fi
output=$("$EVAL_DIR/run-eval.sh" "${run_args[@]}" 2>&1)
echo "$output"
# Extract the results directory from run-eval output
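  # (Assumed output format: run-eval.sh prints the results location on a line beginning
  #  with "Directory: <path>", or on a fallback line beginning with "Results: <path>";
  #  this is inferred from the grep patterns below.)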
result_dir=$(echo "$output" | grep "^Directory:" | head -1 | awk '{print $2}')
# Fallback to "Results:" prefix
if [[ -z "$result_dir" || ! -d "$result_dir" ]]; then
result_dir=$(echo "$output" | grep "^Results:" | head -1 | awk '{print $2}')
fi
[[ -n "$result_dir" && -d "$result_dir" ]] || die "Could not find results directory for $template"
RESULT_DIRS[$template]="$result_dir"
echo ""
done
# --- Score (single-run only, multi-run scores during run-eval.sh) ---
if [[ "$NUM_RUNS" -eq 1 ]]; then
echo "=== Scoring ==="
echo ""
for template in "${TEMPLATES[@]}"; do
result_dir="${RESULT_DIRS[$template]}"
echo "--- Scoring: $template ---"
"$EVAL_DIR/score-eval.sh" "$result_dir"
echo ""
done
fi
# --- Read scores ---
declare -A SCORES
read_scores() {
  local template=$1
  local result_dir="${RESULT_DIRS[$template]}"
  if [[ "$NUM_RUNS" -gt 1 ]]; then
    # Multi-run: read from aggregate
    local agg_file="$result_dir/skill.aggregate.json"
    if [[ -f "$agg_file" ]]; then
      SCORES[$template]=$(jq -r '.total.avg | floor' "$agg_file")
      return
    fi
  fi
  # Single-run: read from score file
  local score_file="$result_dir/skill.score.json"
  if [[ -f "$score_file" ]]; then
    SCORES[$template]=$(jq -r '.total' "$score_file")
  else
    echo "WARNING: No score file for $template"
    SCORES[$template]="0"
  fi
}
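
# (For reference, the score files read here are assumed to look roughly like the sketches
# below; field names match the jq queries in this script, criterion names and numbers are
# illustrative:
#   skill.score.json:
#     { "total": 82, "criteria": { "correct_order": 4 } }
#   skill.aggregate.json:
#     { "total": { "avg": 81.3 },
#       "criteria": { "correct_order": { "avg": 3.7, "stddev": 0.6, "scores": [3, 4, 4] } },
#       "flaky_criteria": ["correct_order"] }
# )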
for template in "${TEMPLATES[@]}"; do
read_scores "$template"
done
# --- Compare to baseline (totals) ---
echo "=== Regression Check ==="
echo ""
printf "%-25s %8s %8s %8s %10s\n" "Template" "Score" "Min" "Expected" "Status"
printf "%-25s %8s %8s %8s %10s\n" "--------" "-----" "---" "--------" "------"
FAILED=0
for template in "${TEMPLATES[@]}"; do
score="${SCORES[$template]}"
min=$(jq -r --arg t "$template" '.evals[$t].min' "$BASELINE_FILE")
expected=$(jq -r --arg t "$template" '.evals[$t].expected' "$BASELINE_FILE")
max=$(jq -r --arg t "$template" '.evals[$t].max' "$BASELINE_FILE")
if [[ "$score" -lt "$min" ]]; then
status="FAIL"
FAILED=1
elif [[ "$score" -lt "$expected" ]]; then
status="WARN"
else
status="PASS"
fi
printf "%-25s %8s %8s %8s %10s\n" "$template" "$score/$max" "$min" "$expected" "$status"
done
echo ""
# --- Per-criterion regression check ---
echo "=== Per-Criterion Breakdown ==="
echo ""
CRITERION_FAILURES=0
for template in "${TEMPLATES[@]}"; do
result_dir="${RESULT_DIRS[$template]}"
# Check if baseline has per-criterion data
has_criteria=$(jq -r --arg t "$template" '.evals[$t].criteria // empty' "$BASELINE_FILE")
[[ -n "$has_criteria" ]] || continue
# Read actual criterion scores
local_criteria=""
if [[ "$NUM_RUNS" -gt 1 ]]; then
agg_file="$result_dir/skill.aggregate.json"
[[ -f "$agg_file" ]] && local_criteria=$(jq -r '.criteria' "$agg_file")
else
score_file="$result_dir/skill.score.json"
[[ -f "$score_file" ]] && local_criteria=$(jq -r '.criteria' "$score_file")
fi
[[ -n "$local_criteria" ]] || continue
echo "--- $template ---"
# Get criterion names from baseline
criteria_names=$(jq -r --arg t "$template" '.evals[$t].criteria | keys[]' "$BASELINE_FILE")
for crit in $criteria_names; do
crit_expected=$(jq -r --arg t "$template" --arg c "$crit" '.evals[$t].criteria[$c].expected' "$BASELINE_FILE")
crit_min=$(jq -r --arg t "$template" --arg c "$crit" '.evals[$t].criteria[$c].min' "$BASELINE_FILE")
# Get actual score
if [[ "$NUM_RUNS" -gt 1 ]]; then
actual=$(jq -r --arg c "$crit" '.criteria[$c].avg // 0' "$agg_file")
stddev=$(jq -r --arg c "$crit" '.criteria[$c].stddev // 0' "$agg_file")
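      # Truncate the float average toward zero so the integer -lt comparisons below work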
      actual_int=$(echo "$actual" | awk '{printf "%d", $1}')
      score_display="${actual} (stddev=${stddev})"
    else
      actual=$(echo "$local_criteria" | jq -r --arg c "$crit" '.[$c] // 0')
      actual_int="$actual"
      score_display="$actual"
    fi
    if [[ "$actual_int" -lt "$crit_min" ]]; then
      status="FAIL"
      CRITERION_FAILURES=$((CRITERION_FAILURES + 1))
    elif [[ "$actual_int" -lt "$crit_expected" ]]; then
      status="WARN"
    else
      status="PASS"
    fi
    printf " %-40s %12s expected=%-3s min=%-3s %s\n" "$crit" "$score_display" "$crit_expected" "$crit_min" "$status"
  done
  echo ""
done

# --- Flaky criteria report (multi-run only) ---
if [[ "$NUM_RUNS" -gt 1 ]]; then
echo "=== Variance Report ==="
echo ""
any_flaky=0
for template in "${TEMPLATES[@]}"; do
result_dir="${RESULT_DIRS[$template]}"
agg_file="$result_dir/skill.aggregate.json"
[[ -f "$agg_file" ]] || continue
flaky=$(jq -r '.flaky_criteria // [] | .[]' "$agg_file" 2>/dev/null)
if [[ -n "$flaky" ]]; then
any_flaky=1
echo " $template:"
for crit in $flaky; do
scores=$(jq -r --arg c "$crit" '.criteria[$c].scores | map(tostring) | join(", ")' "$agg_file")
stddev=$(jq -r --arg c "$crit" '.criteria[$c].stddev' "$agg_file")
echo " $crit: [$scores] stddev=$stddev"
done
fi
done
if [[ "$any_flaky" -eq 0 ]]; then
echo " No flaky criteria detected across $NUM_RUNS runs."
fi
echo ""
fi
# --- Write summary for CI ---
SUMMARY_FILE="$EVAL_DIR/results/regression-summary.json"
mkdir -p "$(dirname "$SUMMARY_FILE")"
{
  echo "{"
  echo " \"timestamp\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
  echo " \"runs_per_eval\": $NUM_RUNS,"
  echo " \"passed\": $([ $FAILED -eq 0 ] && [ $CRITERION_FAILURES -eq 0 ] && echo true || echo false),"
  echo " \"total_regressions\": $FAILED,"
  echo " \"criterion_regressions\": $CRITERION_FAILURES,"
  echo " \"results\": {"
  first=1
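  # Emit a comma before every entry except the first so the hand-built JSON stays valid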
for template in "${TEMPLATES[@]}"; do
[ $first -eq 0 ] && echo ","
first=0
score="${SCORES[$template]}"
min=$(jq -r --arg t "$template" '.evals[$t].min' "$BASELINE_FILE")
expected=$(jq -r --arg t "$template" '.evals[$t].expected' "$BASELINE_FILE")
printf ' "%s": { "score": %s, "min": %s, "expected": %s }' "$template" "$score" "$min" "$expected"
done
echo ""
echo " }"
echo "}"
} > "$SUMMARY_FILE"
echo "Summary: $SUMMARY_FILE"
if [[ $FAILED -eq 1 || $CRITERION_FAILURES -gt 0 ]]; then
  echo ""
  if [[ $FAILED -eq 1 ]]; then
    echo "REGRESSION DETECTED: Total score below minimum threshold."
  fi
  if [[ $CRITERION_FAILURES -gt 0 ]]; then
    echo "CRITERION REGRESSION: $CRITERION_FAILURES criterion(s) below minimum threshold."
  fi
  exit 1
fi
echo ""
echo "All evals passed regression check."
exit 0