{
  "context": "Tests whether the agent understands the LLM provider abstraction (dataclass, client setup, provider-specific handling), observability patterns, and test generation framework detection from the llm-provider-abstraction and test-generation-pipeline docs.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "LLM dataclass and model instances",
      "description": "LLM is a pydantic_dataclass with all 6 fields (name, max_tokens, model_type as Literal, input_cost, cached_input_cost, output_cost). Concrete instances use correct pricing from the docs.",
      "max_score": 20
    },
    {
      "name": "Provider-specific call handling",
      "description": "call_llm dispatches by model_type. OpenAI uses client.chat.completions.create() with max_completion_tokens for GPT-5-mini and max_tokens for older models. Anthropic extracts system prompt from messages and passes via system= kwarg, and concatenates text blocks from the response.",
      "max_score": 30
    },
    {
      "name": "Observability in finally block",
      "description": "record_llm_call() is called in a finally block (not just after success), ensuring every LLM call is recorded to the database regardless of success or failure. Includes trace_id, call_type, model, cost, and latency.",
      "max_score": 25
    },
    {
      "name": "Framework detection",
      "description": "detect_frameworks_from_code parses import statements to identify PyTorch, TensorFlow, and JAX. Handles both 'import torch' and aliased imports like 'import tensorflow as tf'.",
      "max_score": 25
    }
  ]
}