mirror of
https://github.com/codeflash-ai/codeflash-internal.git
synced 2026-05-04 18:25:18 +00:00
26 lines
1.5 KiB
JSON
{
  "context": "Tests whether the agent understands the model distribution formula, LLM cost calculation differences between OpenAI and Anthropic, and the MAX_OPTIMIZER_CALLS constant.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Distribution formula correct",
      "description": "get_model_distribution uses claude_calls = (total - 1) // 2 and gpt_calls = total - claude_calls, and total = min(n_candidates, max_calls). For n_candidates=5, max_calls=6: returns 3 OpenAI + 2 Anthropic. For n_candidates=6, max_calls=6: returns 4 OpenAI + 2 Anthropic.",
      "max_score": 30
    },
    {
      "name": "OpenAI cached token accounting",
      "description": "For OpenAI, cached_input_tokens is treated as a subset of input_tokens. Non-cached input = input_tokens - cached_input_tokens. Cost uses both rates correctly with GPT-5-mini pricing ($0.25 input, $0.03 cached, $2.00 output per 1M tokens).",
      "max_score": 25
    },
    {
      "name": "Anthropic cached token accounting",
      "description": "For Anthropic, cached_input_tokens is additive to input_tokens (separate count). Cost uses Claude Sonnet 4.5 pricing ($3.00 input, $15.00 output per 1M tokens).",
      "max_score": 25
    },
    {
      "name": "Full run estimation uses MAX_OPTIMIZER_CALLS=6",
      "description": "estimate_full_run_cost uses MAX_OPTIMIZER_CALLS = 6 as the max_calls parameter and correctly multiplies per-call costs by the call count for each provider.",
      "max_score": 20
    }
  ]
}