codeflash/tiles/codeflash-docs/evals/scenario-2/criteria.json
Kevin Turcios 869fbe1766 chore: add eval scenarios for codeflash-docs tile
5 scenarios testing: code serialization format, candidate lifecycle/DAG,
deterministic patches, effort levels/selection criteria, and function
representation/concurrency model.
2026-02-14 21:29:22 -05:00

26 lines
1.1 KiB
JSON

{
"context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Lists source types",
"description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
"max_score": 25
},
{
"name": "Parent ID linkage",
"description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
"max_score": 25
},
{
"name": "Refinement uses runtime data",
"description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
"max_score": 25
},
{
"name": "Repair uses test diffs",
"description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
"max_score": 25
}
]
}