codeflash/tiles/codeflash-docs/evals/scenario-4/criteria.json
Kevin Turcios 869fbe1766 chore: add eval scenarios for codeflash-docs tile
5 scenarios testing: code serialization format, candidate lifecycle/DAG,
deterministic patches, effort levels/selection criteria, and function
representation/concurrency model.
2026-02-14 21:29:22 -05:00

26 lines
1 KiB
JSON

{
"context": "Tests whether the agent knows the specific effort level values for candidate generation and the criteria used to select the best optimization candidate.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Candidate counts by effort",
"description": "States correct N_OPTIMIZER_CANDIDATES values: LOW=3, MEDIUM=5, HIGH=6 (at least 2 of 3 correct)",
"max_score": 25
},
{
"name": "Speedup as primary selector",
"description": "States that the winning candidate is selected primarily by highest speedup ratio",
"max_score": 25
},
{
"name": "Diff length as tiebreaker",
"description": "States that for tied speedups, shortest diff length from original is used as tiebreaker",
"max_score": 25
},
{
"name": "Refinement ranking weights",
"description": "States that refinement candidates use weighted ranking with runtime weighted more heavily than diff (2:1 ratio or REFINED_CANDIDATE_RANKING_WEIGHTS=(2,1))",
"max_score": 25
}
]
}