codeflash-internal/tiles/codeflash-internal-skills/evals/scenario-2/criteria.json
2026-02-14 22:25:30 -05:00

26 lines
1.2 KiB
JSON

{
"context": "Tests the ability to diagnose postprocessing as the stage that removed all candidates and to use logging data to confirm the diagnosis",
"type": "weighted_checklist",
"checklist": [
{
"name": "Identifies postprocessing stage",
"description": "Correctly identifies the postprocessing stage (Step 5) in core/languages/python/optimizer/postprocess.py as the stage where candidates are lost",
"max_score": 25
},
{
"name": "Explains both postprocessing checks",
"description": "Explains that deduplicate_optimizations() removes AST-identical candidates via ast.parse()/ast.dump(), and that equality_check() removes candidates identical to the original code",
"max_score": 25
},
{
"name": "Uses logging to confirm diagnosis",
"description": "References the logging table (optimizations_raw vs optimizations_post in core/log_features/models.py) to confirm that candidates existed before postprocessing but were all filtered out",
"max_score": 25
},
{
"name": "Provides actionable recommendation",
"description": "Recommends increasing n_candidates, improving prompt quality, or checking that the function is non-trivial enough to optimize",
"max_score": 25
}
]
}