codeflash-internal/tiles/codeflash-internal-skills/evals/scenario-5/criteria.json
2026-02-14 22:25:30 -05:00

26 lines
1.3 KiB
JSON

{
"context": "Tests ability to diagnose test generation instrumentation failures where GPU sync injection introduces syntax errors",
"type": "weighted_checklist",
"checklist": [
{
"name": "Identifies instrumentation stage",
"description": "Correctly identifies the instrumentation stage (Step 7) in core/languages/python/testgen/instrumentation/instrument_new_tests.py as the failure point, rather than the postprocessing stage or the LLM response",
"max_score": 25
},
{
"name": "Explains framework detection and sync injection",
"description": "Explains that detect_frameworks_from_code() found PyTorch, which triggered _create_device_sync_precompute_statements() to inject torch.cuda.synchronize() calls for GPU timing accuracy",
"max_score": 25
},
{
"name": "Diagnoses the syntax error cause",
"description": "Identifies that the sync statement was injected without proper newline/whitespace separation, causing it to concatenate with an existing line (torch.cuda.synchronize()import torch)",
"max_score": 30
},
{
"name": "Recommends fix approach",
"description": "Recommends fixing the injection logic to add proper newlines, and suggests adding a post-instrumentation compilation check to catch such errors before the instrumented code is returned",
"max_score": 20
}
]
}