codeflash/tiles/codeflash-skills/evals/scenario-5/criteria.json

{
  "context": "Tests whether the agent understands codeflash's candidate deduplication via AST normalization and the specific conditions under which code repair is triggered vs skipped.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "AST normalization",
      "description": "Mentions that deduplication uses AST normalization (normalize_code from code_utils/deduplicate_code.py), NOT simple string comparison",
      "max_score": 25
    },
    {
      "name": "Duplicate result copying",
      "description": "Explains that duplicate candidates copy results from the first-seen candidate rather than being re-tested",
      "max_score": 25
    },
    {
      "name": "Repair trigger threshold",
      "description": "States that repair triggers when fewer than 2 candidates pass (MIN_CORRECT_CANDIDATES=2), NOT when zero candidates pass or when any candidate fails",
      "max_score": 25
    },
    {
      "name": "Unmatched percentage limit",
      "description": "Mentions REPAIR_UNMATCHED_PERCENTAGE_LIMIT as a condition that can cause repair to be skipped entirely, with effort-dependent values (0.2/0.3/0.4)",
      "max_score": 25
    }
  ]
}