# Commit notes:
# - Optimize codeflash-optimize SKILL.md (review score 17% → 98%, eval 87% → 100%)
# - Fix frontmatter (allowed-tools format, argument-hint under metadata)
# - Lead description with concrete actions, explicit agent launch parameters
# - Add multi-run variance detection to eval system (--runs N flag)
# - score.py aggregate command: min/max/avg/stddev per criterion, flaky detection
# - check-regression.sh defaults to 3 runs for reliable regression detection
# - Add per-criterion regression tracking to baseline-scores.json (v3)
# - Reports exactly which criteria regressed, not just total score drops
# - Rename evals/ → codeflash-evals/ to avoid tessl directory conflicts
# - Switch tessl to managed mode, gitignore vendored tiles and symlinks
107 lines
3.3 KiB
YAML
107 lines
3.3 KiB
YAML
name: Eval Regression

# Manual trigger only: the workflow runs when someone dispatches it, optionally
# narrowing the run to specific eval templates.
on:
  workflow_dispatch:
    inputs:
      templates:
        # An empty value (the default) means "run every baseline eval".
        description: "Comma-separated eval templates (blank = all baseline evals)"
        required: false
        default: ""
|
|
|
|
jobs:
  # Single job: set up tooling, run the regression check, publish results.
  eval:
    runs-on: ubuntu-latest
    # Hard stop so a hung eval cannot run past half an hour.
    timeout-minutes: 30
    permissions:
      # Read-only access to repository contents.
      contents: read
      # Lets the job request an OIDC token; presumably this is what the AWS
      # credentials step relies on to assume its role (no static keys are
      # passed to it) -- confirm against the role's trust policy.
      id-token: write
    steps:
|
|
# Fetch the repository so the eval scripts under codeflash-evals/ are present.
- name: Checkout repository
  uses: actions/checkout@v4
|
|
|
|
# Obtain AWS credentials by assuming the role named in the AWS_ROLE_TO_ASSUME
# secret, in the region given by the AWS_REGION secret.
- name: Configure AWS Credentials
  uses: aws-actions/configure-aws-credentials@v4
  with:
    aws-region: ${{ secrets.AWS_REGION }}
    role-to-assume: ${{ secrets.AWS_ROLE_TO_ASSUME }}
|
|
|
|
# Make the `uv` tool available to later steps (presumably used by the eval
# scripts -- confirm in codeflash-evals/).
- name: Install uv
  uses: astral-sh/setup-uv@v6
|
|
|
|
# Install the Claude Code CLI globally from npm (latest published version;
# no version pin is specified).
- name: Install Claude Code
  run: npm install --global @anthropic-ai/claude-code
|
|
|
|
# Write the user-level Claude Code settings file. It pre-approves the listed
# tools and denies none, so the CLI does not stop to ask for permission.
# Bedrock itself is selected through environment variables on the
# "Run regression check" step, not in this file.
- name: Configure Claude for Bedrock
  run: |
    settings_dir="$HOME/.claude"
    mkdir -p "$settings_dir"

    # Quoted delimiter: the JSON below is written verbatim, with no shell
    # expansion.
    cat > "$settings_dir/settings.json" << 'JSON'
    {
      "permissions": {
        "allow": ["Bash", "Read", "Write", "Edit", "Glob", "Grep", "Agent", "Skill"],
        "deny": []
      }
    }
    JSON
|
|
|
|
# Run the eval regression check, optionally restricted to the templates given
# in the `templates` workflow input.
- name: Run regression check
  env:
    ANTHROPIC_MODEL: us.anthropic.claude-sonnet-4-6
    CLAUDE_CODE_USE_BEDROCK: "1"
    # The user-supplied input is handed to the script as data through the
    # environment. Interpolating ${{ inputs.templates }} directly into the
    # script text would let a crafted value (quotes, $(...), backticks) be
    # executed as shell code.
    INPUT_TEMPLATES: ${{ inputs.templates }}
  run: |
    chmod +x codeflash-evals/check-regression.sh codeflash-evals/run-eval.sh codeflash-evals/score-eval.sh

    # Build the argument list: one argument per comma-separated template name.
    # An empty list means check-regression.sh is called with no arguments.
    ARGS=()
    if [ -n "$INPUT_TEMPLATES" ]; then
      IFS=',' read -ra TMPLS <<< "$INPUT_TEMPLATES"
      for t in "${TMPLS[@]}"; do
        # xargs with no command echoes its input with surrounding whitespace
        # trimmed.
        name="$(echo "$t" | xargs)"
        # Skip blank entries (e.g. "a,,b" or "a, ,b") instead of passing an
        # empty template name to the script.
        if [ -n "$name" ]; then
          ARGS+=("$name")
        fi
      done
    fi

    ./codeflash-evals/check-regression.sh "${ARGS[@]}"
|
|
|
|
# Publish everything under codeflash-evals/results/ as a build artifact.
# `always()` keeps this step running even when the regression check failed,
# so results from failing runs are still uploaded.
- name: Upload results
  uses: actions/upload-artifact@v4
  if: always()
  with:
    # The run number is appended to the artifact name.
    name: eval-results-${{ github.run_number }}
    path: codeflash-evals/results/
    # Artifacts are kept for 30 days.
    retention-days: 30
|
|
|
|
# Render regression-summary.json as a Markdown report on the run's summary
# page: an overall PASSED/FAILED line, one table row per template, and the
# timestamp recorded in the summary file. Runs even if earlier steps failed.
- name: Post job summary
  if: always()
  run: |
    SUMMARY="codeflash-evals/results/regression-summary.json"
    if [ ! -f "$SUMMARY" ]; then
      # Nothing to report; emit a warning annotation and leave the step green.
      echo "::warning::No regression summary found"
      exit 0
    fi

    passed=$(jq -r '.passed' "$SUMMARY")

    # Everything printed inside the braces is appended to the job summary
    # through a single, quoted redirect.
    {
      echo "## Eval Regression Results"
      echo ""

      if [ "$passed" = "true" ]; then
        echo "**Status: PASSED**"
      else
        echo "**Status: FAILED**"
      fi

      echo ""
      echo "| Template | Score | Min | Expected | Status |"
      echo "|----------|-------|-----|----------|--------|"

      # Row status is computed in jq rather than with `[ a -lt b ]`: the shell
      # test only accepts integers, so a fractional or null value made it
      # error out and the row fell through to PASS. In jq, numbers compare
      # numerically and null sorts below every number, so a missing or
      # non-numeric score is reported as FAIL.
      jq -r '
        def num: (tonumber? // null);
        (.results // {})
        | to_entries[]
        | (.value.score    | num) as $score
        | (.value.min      | num) as $min
        | (.value.expected | num) as $expected
        | (if   $score < $min      then "FAIL"
           elif $score < $expected then "WARN"
           else "PASS" end) as $status
        | "| \(.key) | \(.value.score) | \(.value.min) | \(.value.expected) | \($status) |"
      ' "$SUMMARY"

      echo ""
      echo "*Triggered at $(jq -r '.timestamp' "$SUMMARY")*"
    } >> "$GITHUB_STEP_SUMMARY"
|