codeflash/tiles/codeflash-skills/evals/scenario-1/criteria.json

{
  "context": "Tests whether the agent follows the sequential debugging workflow from the skill, checking pipeline stages in order and using correct threshold values when diagnosing an optimization that produced no results.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Sequential stage order",
      "description": "Investigates pipeline stages in order: discovery, then ranking, then context extraction, then AI service, then test failures. Does NOT jump to a later stage without first checking the earlier ones.",
      "max_score": 25
    },
    {
      "name": "Token limit value",
      "description": "References the specific token limit value of 16000, attributed to OPTIMIZATION_CONTEXT_TOKEN_LIMIT or TESTGEN_CONTEXT_TOKEN_LIMIT, when checking context extraction",
      "max_score": 25
    },
    {
      "name": "Importance threshold",
      "description": "References DEFAULT_IMPORTANCE_THRESHOLD=0.001 when checking function ranking",
      "max_score": 25
    },
    {
      "name": "Stops at failure",
      "description": "Identifies the failing stage and focuses investigation there rather than continuing through all remaining stages",
      "max_score": 25
    }
  ]
}