- Optimize codeflash-optimize SKILL.md (review score 17% → 98%, eval 87% → 100%)
- Fix frontmatter (allowed-tools format, argument-hint under metadata)
- Lead description with concrete actions, explicit agent launch parameters
- Add multi-run variance detection to eval system (--runs N flag)
- score.py aggregate command: min/max/avg/stddev per criterion, flaky detection
- check-regression.sh defaults to 3 runs for reliable regression detection
- Add per-criterion regression tracking to baseline-scores.json (v3)
- Reports exactly which criteria regressed, not just total score drops
- Rename evals/ → codeflash-evals/ to avoid tessl directory conflicts
- Switch tessl to managed mode, gitignore vendored tiles and symlinks
43 lines
1.5 KiB
JSON
43 lines
1.5 KiB
JSON
{
  "version": 3,
  "updated": "2026-03-27",
  "note": "v3: per-criterion baselines for pinpointed regression detection",
  "evals": {
    "ranking": {
      "expected": 9,
      "min": 7,
      "max": 10,
      "criteria": {
        "built_ranked_list_with_impact_pct": { "expected": 3, "min": 2 },
        "fixed_highest_impact_first": { "expected": 2, "min": 1 },
        "skipped_low_impact_targets": { "expected": 3, "min": 2 },
        "reprofiled_after_major_fix": { "expected": 2, "min": 1 }
      }
    },
    "memory-hard": {
      "expected": 9,
      "min": 7,
      "max": 10,
      "criteria": {
        "used_memory_profiler": { "expected": 2, "min": 2 },
        "profiled_per_stage": { "expected": 2, "min": 1 },
        "identified_dominant_allocator": { "expected": 3, "min": 2 },
        "fixed_dominant_issue": { "expected": 2, "min": 1 },
        "fixed_secondary_issues": { "expected": 1, "min": 0 }
      }
    },
    "memory-misdirection": {
      "expected": 9,
      "min": 7,
      "max": 10,
      "criteria": {
        "used_memory_profiler": { "expected": 1, "min": 1 },
        "profiled_iteratively": { "expected": 2, "min": 1 },
        "identified_analytics_as_major": { "expected": 2, "min": 1 },
        "fixed_analytics_details": { "expected": 2, "min": 1 },
        "fixed_other_issues": { "expected": 2, "min": 1 },
        "tests_pass": { "expected": 1, "min": 1 }
      }
    }
  }
}