codeflash-internal/tiles/codeflash-internal-skills/evals/scenario-5/criteria.json

{
  "context": "Tests ability to diagnose test generation instrumentation failures where GPU sync injection introduces syntax errors",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Identifies instrumentation stage",
      "description": "Correctly identifies the instrumentation stage (Step 7) in core/languages/python/testgen/instrumentation/instrument_new_tests.py as the failure point, not postprocessing or LLM response",
      "max_score": 25
    },
    {
      "name": "Explains framework detection and sync injection",
      "description": "Explains that detect_frameworks_from_code() found PyTorch, which triggered _create_device_sync_precompute_statements() to inject torch.cuda.synchronize() calls for GPU timing accuracy",
      "max_score": 25
    },
    {
      "name": "Diagnoses the syntax error cause",
      "description": "Identifies that the sync statement was injected without proper newline/whitespace separation, causing it to concatenate with an existing line (torch.cuda.synchronize()import torch)",
      "max_score": 30
    },
    {
      "name": "Recommends fix approach",
      "description": "Recommends fixing the injection logic to add proper newlines and suggests adding a post-instrumentation compilation check to catch such errors before returning",
      "max_score": 20
    }
  ]
}