codeflash/tiles/codeflash-docs/evals/scenario-2/criteria.json

{
  "context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Lists source types",
      "description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
      "max_score": 25
    },
    {
      "name": "Parent ID linkage",
      "description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
      "max_score": 25
    },
    {
      "name": "Refinement uses runtime data",
      "description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
      "max_score": 25
    },
    {
      "name": "Repair uses test diffs",
      "description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
      "max_score": 25
    }
  ]
}