codeflash-agent/.github/workflows/eval-regression.yml
Kevin Turcios 37efa524d7 feat: improve skill, eval system, and tessl config
- Optimize codeflash-optimize SKILL.md (review score 17% → 98%, eval 87% → 100%)
  - Fix frontmatter (allowed-tools format, argument-hint under metadata)
  - Lead description with concrete actions, explicit agent launch parameters
- Add multi-run variance detection to eval system (--runs N flag)
  - score.py aggregate command: min/max/avg/stddev per criterion, flaky detection
  - check-regression.sh defaults to 3 runs for reliable regression detection
- Add per-criterion regression tracking to baseline-scores.json (v3)
  - Reports exactly which criteria regressed, not just total score drops
- Rename evals/ → codeflash-evals/ to avoid tessl directory conflicts
- Switch tessl to managed mode, gitignore vendored tiles and symlinks
2026-03-27 11:30:17 -05:00

107 lines
3.3 KiB
YAML

---
# Eval Regression — manually-triggered workflow that runs the codeflash
# eval regression suite against baseline scores and publishes a
# per-template score table to the job summary.
name: Eval Regression

# Manual trigger only. `templates` narrows the run to a comma-separated
# subset of eval templates; blank runs every baseline eval.
on:
  workflow_dispatch:
    inputs:
      templates:
        description: 'Comma-separated eval templates (blank = all baseline evals)'
        required: false
        default: ''

jobs:
  eval:
    runs-on: ubuntu-latest
    permissions:
      contents: read  # checkout only
      id-token: write  # OIDC federation for aws-actions/configure-aws-credentials
    timeout-minutes: 30
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
          aws-region: ${{ secrets.AWS_REGION }}

      - name: Install uv
        uses: astral-sh/setup-uv@v6

      - name: Install Claude Code
        run: npm install -g @anthropic-ai/claude-code

      # Write a permissions allow-list so eval runs don't stall on
      # interactive tool-approval prompts. Quoted 'EOF' keeps the heredoc
      # body literal (no shell expansion inside the JSON).
      - name: Configure Claude for Bedrock
        run: |
          mkdir -p ~/.claude
          cat > ~/.claude/settings.json << 'EOF'
          {
            "permissions": {
              "allow": ["Bash", "Read", "Write", "Edit", "Glob", "Grep", "Agent", "Skill"],
              "deny": []
            }
          }
          EOF

      - name: Run regression check
        env:
          ANTHROPIC_MODEL: us.anthropic.claude-sonnet-4-6
          CLAUDE_CODE_USE_BEDROCK: "1"
          # Pass the workflow input through the environment instead of
          # interpolating ${{ ... }} into the script body — direct
          # interpolation of user-controlled input into `run:` is a
          # shell-injection vector.
          TEMPLATES_INPUT: ${{ inputs.templates }}
        run: |
          chmod +x codeflash-evals/check-regression.sh codeflash-evals/run-eval.sh codeflash-evals/score-eval.sh
          ARGS=()
          if [ -n "$TEMPLATES_INPUT" ]; then
            IFS=',' read -ra TMPLS <<< "$TEMPLATES_INPUT"
            for t in "${TMPLS[@]}"; do
              # xargs trims surrounding whitespace from each template name
              ARGS+=("$(echo "$t" | xargs)")
            done
          fi
          ./codeflash-evals/check-regression.sh "${ARGS[@]}"

      # Always keep raw results, pass or fail, for post-hoc inspection.
      - name: Upload results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: eval-results-${{ github.run_number }}
          path: codeflash-evals/results/
          retention-days: 30

      # Render regression-summary.json as a markdown table in the job
      # summary. Missing summary is a warning, not a failure, so the step
      # stays green when the run died before scoring.
      - name: Post job summary
        if: always()
        run: |
          SUMMARY="codeflash-evals/results/regression-summary.json"
          if [ ! -f "$SUMMARY" ]; then
            echo "::warning::No regression summary found"
            exit 0
          fi
          passed=$(jq -r '.passed' "$SUMMARY")
          echo "## Eval Regression Results" >> "$GITHUB_STEP_SUMMARY"
          echo "" >> "$GITHUB_STEP_SUMMARY"
          if [ "$passed" = "true" ]; then
            echo "**Status: PASSED**" >> "$GITHUB_STEP_SUMMARY"
          else
            echo "**Status: FAILED**" >> "$GITHUB_STEP_SUMMARY"
          fi
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "| Template | Score | Min | Expected | Status |" >> "$GITHUB_STEP_SUMMARY"
          echo "|----------|-------|-----|----------|--------|" >> "$GITHUB_STEP_SUMMARY"
          # NOTE(review): `-lt` only handles integers — assumes score.py
          # emits integer scores; confirm, or switch to an awk comparison
          # if fractional scores are possible.
          jq -r '.results | to_entries[] | "\(.key)\t\(.value.score)\t\(.value.min)\t\(.value.expected)"' "$SUMMARY" | \
          while IFS=$'\t' read -r template score min expected; do
            if [ "$score" -lt "$min" ]; then
              status="FAIL"
            elif [ "$score" -lt "$expected" ]; then
              status="WARN"
            else
              status="PASS"
            fi
            echo "| $template | $score | $min | $expected | $status |" >> "$GITHUB_STEP_SUMMARY"
          done
          echo "" >> "$GITHUB_STEP_SUMMARY"
          echo "*Triggered at $(jq -r '.timestamp' "$SUMMARY")*" >> "$GITHUB_STEP_SUMMARY"