mirror of
https://github.com/codeflash-ai/codeflash-internal.git
synced 2026-05-04 18:25:18 +00:00
26 lines
1.5 KiB
JSON
{
  "context": "Tests whether the agent understands the model distribution formula, LLM cost calculation differences between OpenAI and Anthropic, and the MAX_OPTIMIZER_CALLS constant.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Distribution formula correct",
      "description": "get_model_distribution uses claude_calls = (total - 1) // 2 and gpt_calls = total - claude_calls, and total = min(n_candidates, max_calls). For n_candidates=5, max_calls=6: returns 3 OpenAI + 2 Anthropic. For n_candidates=6, max_calls=6: returns 4 OpenAI + 2 Anthropic.",
      "max_score": 30
    },
    {
      "name": "OpenAI cached token accounting",
      "description": "For OpenAI, cached_input_tokens is treated as a subset of input_tokens. Non-cached input = input_tokens - cached_input_tokens. Cost uses both rates correctly with GPT-5-mini pricing ($0.25 input, $0.03 cached, $2.00 output per 1M tokens).",
      "max_score": 25
    },
    {
      "name": "Anthropic cached token accounting",
      "description": "For Anthropic, cached_input_tokens is additive to input_tokens (separate count). Cost uses Claude Sonnet 4.5 pricing ($3.00 input, $15.00 output per 1M tokens).",
      "max_score": 25
    },
    {
      "name": "Full run estimation uses MAX_OPTIMIZER_CALLS=6",
      "description": "estimate_full_run_cost uses MAX_OPTIMIZER_CALLS = 6 as the max_calls parameter and correctly multiplies per-call costs by the call count for each provider.",
      "max_score": 20
    }
  ]
}