#!/bin/bash
set -euo pipefail
# Eval runner for codeflash-agent plugin
# Usage:
# ./run-eval.sh <eval-name>                  # run both skill + baseline
# ./run-eval.sh <eval-name> --skill-only     # run with-skill only
# ./run-eval.sh <eval-name> --baseline-only  # run baseline only
# ./run-eval.sh <eval-name> --runs 3         # run 3 times, aggregate
# ./run-eval.sh --list                       # list available evals
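#
# Prerequisites (inferred from the commands this script invokes): jq, git, gh,
# uv, python3, and the claude CLI all need to be on PATH; score-eval.sh and
# score.py are expected to live alongside this script.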
EVAL_DIR="$(cd "$(dirname "$0")" && pwd)"
TEMPLATES_DIR="$EVAL_DIR/templates"
REPOS_DIR="$EVAL_DIR/repos"
RESULTS_DIR="$EVAL_DIR/results"
REPO_ROOT="$(cd "$EVAL_DIR/.." && pwd)"
PLUGIN_DIR="$REPO_ROOT"
# --- Helpers ---
die() { echo "ERROR: $*" >&2; exit 1; }
# Find eval directory (templates/ or repos/)
find_eval_dir() {
local name=$1
if [ -d "$TEMPLATES_DIR/$name" ]; then
echo "$TEMPLATES_DIR/$name"
elif [ -d "$REPOS_DIR/$name" ]; then
echo "$REPOS_DIR/$name"
else
echo ""
fi
}
list_templates() {
echo "Available evals:"
echo ""
echo " Templates (v1 — bundled source):"
for d in "$TEMPLATES_DIR"/*/; do
[ -d "$d" ] || continue
name=$(basename "$d")
desc=$(jq -r '.description // "no description"' "$d/manifest.json" 2>/dev/null || echo "no manifest")
printf " %-30s %s\n" "$name" "$desc"
done
echo ""
echo " Repos (v2 — cloned from git):"
if [ -d "$REPOS_DIR" ]; then
for d in "$REPOS_DIR"/*/; do
[ -d "$d" ] || continue
name=$(basename "$d")
repo=$(jq -r '.repo // "?"' "$d/manifest.json" 2>/dev/null || echo "?")
desc=$(jq -r '.description // "no description"' "$d/manifest.json" 2>/dev/null || echo "no manifest")
printf " %-30s %s (%s)\n" "$name" "$desc" "$repo"
done
else
echo " (none)"
fi
}
setup_workspace_v1() {
local template=$1 label=$2
local workdir
workdir=$(mktemp -d)/eval-${label}-${template}
cp -r "$TEMPLATES_DIR/$template" "$workdir"
# Remove any leftover .venv / cache
rm -rf "$workdir/.venv" "$workdir/.pytest_cache" "$workdir/__pycache__"
# Install deps
(cd "$workdir" && uv sync --quiet 2>/dev/null)
echo "$workdir"
}
setup_workspace_v2() {
local eval_name=$1 label=$2 manifest=$3
local repo commit cache_dir workdir
repo=$(jq -r '.repo' "$manifest")
commit=$(jq -r '.commit' "$manifest")
cache_dir="$(dirname "$manifest")/workspace"
workdir=$(mktemp -d)/eval-${label}-${eval_name}
# Clone to cache dir if not already present
if [ ! -d "$cache_dir/.git" ]; then
echo "Cloning $repo at $commit (first run — caching to $cache_dir)..." >&2
gh repo clone "$repo" "$cache_dir" -- --quiet --no-checkout --depth 1
(cd "$cache_dir" \
&& git fetch --quiet --depth 1 origin "$commit" \
&& git checkout --quiet FETCH_HEAD)
else
echo "Using cached clone from $cache_dir" >&2
fi
# Copy cached workspace to isolated temp dir for this run
cp -r "$cache_dir" "$workdir"
echo "$workdir"
}
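# For reference, a v2 manifest is expected to look roughly like this. The shape
# is a sketch inferred from the fields this script reads (.version, .repo,
# .commit, .prompt, .description, .test_command); all values are placeholders,
# not a real eval.
#
#   {
#     "version": 2,
#     "description": "short description of the eval",
#     "repo": "owner/example-repo",
#     "commit": "<full commit sha>",
#     "prompt": "Optimize the slow code path and verify with the tests.",
#     "test_command": "uv run pytest -q"
#   }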
build_prompt() {
local manifest=$1
local version
version=$(jq -r '.version // 1' "$manifest")
# v2: prompt is in the manifest
if [ "$version" -ge 2 ]; then
jq -r '.prompt' "$manifest"
return
fi
# v1: derive prompt from eval_type
local eval_type
eval_type=$(jq -r '.eval_type' "$manifest")
case "$eval_type" in
crossdomain)
echo "The tests in this project are too slow. Profile the code, identify the performance bottlenecks in each module, and optimize them. Run the tests before and after to verify correctness and measure speedup."
;;
layered)
echo "test_process_large_batch is too slow and uses too much memory. Profile and optimize process_records. Run the test before and after to verify correctness and measure improvement."
;;
ranking|ranking-hard)
echo "test_large_batch in test_pipeline.py is too slow. Profile run_pipeline, find what's slow, and optimize. Run the test before and after to verify correctness and measure speedup."
;;
memory)
echo "process_dataset in test_large_dataset uses too much memory — peak RSS is 3x higher than the input data size. Profile memory usage, find what's allocating the most, and reduce peak memory. Run the test before and after to verify correctness."
;;
memory-hard)
echo "process_readings in test_large_dataset uses too much memory — peak is 3x higher than the input data. Profile memory usage step by step, find the dominant allocator, and reduce peak memory. Run the test before and after to verify correctness."
;;
memory-balanced)
echo "process_orders in test_large_batch uses too much memory — peak is over 2x the input data. Profile memory at each pipeline stage, find what's adding overhead, and reduce it. Run the test before and after to verify correctness."
;;
memory-misdirection)
echo "process_transactions in test_large_batch uses too much memory — peak is nearly 3x the input data. Profile memory usage, identify the biggest allocators in order of impact, and fix them. Run the test before and after to verify correctness."
;;
*)
echo "The tests in this project are too slow. Profile the code, find the performance bottlenecks, and optimize them. Run the tests before and after to verify correctness and measure improvement."
;;
esac
}
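# A v1 manifest, by contrast, lives alongside bundled source in templates/<name>/
# and only needs the fields read above plus test_command (used in run_claude).
# Illustrative sketch with placeholder values:
#
#   {
#     "version": 1,
#     "description": "short description of the eval",
#     "eval_type": "ranking",
#     "test_command": "uv run pytest -q"
#   }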
run_claude() {
local workdir=$1 label=$2 result_prefix=$3 use_plugin=$4
local start_time end_time duration
echo "=== Running $label in $workdir ==="
start_time=$(date +%s)
local prompt
prompt=$(cat "$workdir/.eval-prompt")
# When running with plugin, instruct Claude to invoke the skill
if [ "$use_plugin" = "true" ]; then
prompt="IMPORTANT: You MUST use the Skill tool to invoke '/codeflash-optimize' for this task. Do NOT solve the problem directly — delegate to the skill first. This is a non-interactive eval — there is no user to respond. When the skill tells you to ask the user a question, DO NOT ask and DO NOT stop. Instead, immediately launch the codeflash agent with the task below. No constraints apply. Task: $prompt"
fi
local claude_args=(-p "$prompt" --output-format json --verbose)
if [ "$use_plugin" = "true" ]; then
claude_args+=(--plugin-dir "$PLUGIN_DIR")
fi
# Run claude in the workspace dir
# --dangerously-skip-permissions: evals run in temp dirs, safe to allow all tools
local exit_code=0
(cd "$workdir" && claude "${claude_args[@]}" --dangerously-skip-permissions) \
> "${result_prefix}.json" 2> "${result_prefix}.stderr" || exit_code=$?
end_time=$(date +%s)
duration=$((end_time - start_time))
echo "$exit_code" > "${result_prefix}.exitcode"
echo "$label completed in ${duration}s (exit code: $exit_code)"
echo "$duration" > "${result_prefix}.duration"
# Detect empty/missing output — likely a crash or timeout
if [ ! -s "${result_prefix}.json" ]; then
echo "WARNING: $label produced no output (exit code: $exit_code)" >&2
echo "CRASH DIAGNOSTIC: claude exited with code $exit_code after ${duration}s. No JSON output produced." \
>> "${result_prefix}.stderr"
if [ "$exit_code" -eq 124 ] || [ "$duration" -gt 1800 ]; then
echo "LIKELY CAUSE: timeout (duration=${duration}s)" >> "${result_prefix}.stderr"
elif [ "$exit_code" -eq 137 ] || [ "$exit_code" -eq 139 ]; then
echo "LIKELY CAUSE: OOM or signal kill (exit code=$exit_code)" >> "${result_prefix}.stderr"
fi
fi
# Run tests post-optimization to check correctness + timing
local test_cmd
test_cmd=$(jq -r '.test_command // empty' "$RUN_DIR/manifest.json")
if [ -n "$test_cmd" ]; then
echo "Running post-optimization tests..."
(cd "$workdir" && eval "$test_cmd") > "${result_prefix}.tests" 2>&1 || true
else
echo "No test_command in manifest — skipping post-optimization tests"
touch "${result_prefix}.tests"
fi
}
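# Each run_claude call leaves a set of artifacts next to its result prefix:
#   <prefix>.json      full claude output (--output-format json)
#   <prefix>.stderr    stderr, plus any crash diagnostics appended above
#   <prefix>.exitcode  claude's exit code
#   <prefix>.duration  wall-clock seconds for the claude run
#   <prefix>.tests     post-optimization test output (empty if no test_command)
# These are what score-eval.sh is expected to consume when scoring a run.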
run_single() {
# Run a single eval iteration into $RUN_DIR
local eval_name=$1 mode=$2 manifest=$3 prompt=$4 version=$5
# --- With-skill run ---
if [ "$mode" != "--baseline-only" ]; then
local skill_workdir
if [ "$version" -ge 2 ]; then
skill_workdir=$(setup_workspace_v2 "$eval_name" "skill" "$manifest")
else
skill_workdir=$(setup_workspace_v1 "$eval_name" "skill")
fi
echo "$prompt" > "$skill_workdir/.eval-prompt"
run_claude "$skill_workdir" "with-skill" "$RUN_DIR/skill" "true"
echo ""
fi
# --- Baseline run ---
if [ "$mode" != "--skill-only" ]; then
local baseline_workdir
if [ "$version" -ge 2 ]; then
baseline_workdir=$(setup_workspace_v2 "$eval_name" "baseline" "$manifest")
else
baseline_workdir=$(setup_workspace_v1 "$eval_name" "baseline")
fi
echo "$prompt" > "$baseline_workdir/.eval-prompt"
run_claude "$baseline_workdir" "baseline" "$RUN_DIR/baseline" "false"
echo ""
fi
}
# --- Main ---
if [ "${1:-}" = "--list" ]; then
list_templates
exit 0
fi
# --- Parse args ---
eval_name=""
mode="--both"
num_runs=1
while [[ $# -gt 0 ]]; do
case "$1" in
--skill-only|--baseline-only)
mode="$1"
shift
;;
--runs)
num_runs="${2:?--runs requires a number}"
shift 2
;;
-*)
die "Unknown flag: $1"
;;
*)
eval_name="$1"
shift
;;
esac
done
[[ -n "$eval_name" ]] || die "Usage: $0 <eval-name> [--skill-only|--baseline-only] [--runs N]"
EVAL_SOURCE_DIR=$(find_eval_dir "$eval_name")
[ -n "$EVAL_SOURCE_DIR" ] || die "Eval not found: $eval_name. Use --list to see available evals."
MANIFEST="$EVAL_SOURCE_DIR/manifest.json"
[ -f "$MANIFEST" ] || die "No manifest.json in: $EVAL_SOURCE_DIR"
VERSION=$(jq -r '.version // 1' "$MANIFEST")
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
# Build prompt
PROMPT=$(build_prompt "$MANIFEST")
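# Results layout (sketch; the files written by score-eval.sh and score.py are
# not shown here):
#   single run:  results/<eval>-<timestamp>/           manifest.json, skill.*, baseline.*
#   multi-run:   results/<eval>-<timestamp>-<N>runs/   manifest.json, run-1/, run-2/, ...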
if [ "$num_runs" -gt 1 ]; then
# Multi-run mode: create parent dir with run-1/, run-2/, etc.
PARENT_DIR="$RESULTS_DIR/${eval_name}-${TIMESTAMP}-${num_runs}runs"
mkdir -p "$PARENT_DIR"
cp "$MANIFEST" "$PARENT_DIR/manifest.json"
echo "Eval: $eval_name (v$VERSION) — $num_runs runs"
echo "Results: $PARENT_DIR"
echo "Prompt: $PROMPT"
echo ""
for i in $(seq 1 "$num_runs"); do
echo "========================================"
echo " Run $i / $num_runs"
echo "========================================"
RUN_DIR="$PARENT_DIR/run-$i"
mkdir -p "$RUN_DIR"
cp "$MANIFEST" "$RUN_DIR/manifest.json"
run_single "$eval_name" "$mode" "$MANIFEST" "$PROMPT" "$VERSION"
# Score this run immediately
"$EVAL_DIR/score-eval.sh" "$RUN_DIR"
echo ""
done
# Aggregate across runs
echo "========================================"
echo " Aggregating $num_runs runs"
echo "========================================"
python3 "$EVAL_DIR/score.py" aggregate "$PARENT_DIR"
echo "=== Results ==="
echo "Directory: $PARENT_DIR"
echo "Runs: $num_runs"
echo ""
echo "Files:"
ls -1 "$PARENT_DIR/"
else
# Single-run mode (original behavior)
RUN_DIR="$RESULTS_DIR/${eval_name}-${TIMESTAMP}"
mkdir -p "$RUN_DIR"
cp "$MANIFEST" "$RUN_DIR/manifest.json"
echo "Eval: $eval_name (v$VERSION)"
echo "Results: $RUN_DIR"
echo "Prompt: $PROMPT"
echo ""
run_single "$eval_name" "$mode" "$MANIFEST" "$PROMPT" "$VERSION"
echo "=== Results ==="
echo "Directory: $RUN_DIR"
echo ""
if [ -f "$RUN_DIR/skill.duration" ] && [ -f "$RUN_DIR/baseline.duration" ]; then
skill_dur=$(cat "$RUN_DIR/skill.duration")
baseline_dur=$(cat "$RUN_DIR/baseline.duration")
echo "With-skill duration: ${skill_dur}s"
echo "Baseline duration: ${baseline_dur}s"
fi
echo ""
echo "Files:"
ls -1 "$RUN_DIR/"
echo ""
echo "Next step: run ./score-eval.sh $RUN_DIR to score the results"
fi