codeflash-internal/tiles/codeflash-internal-skills/evals/summary.json
2026-02-14 22:25:30 -05:00

47 lines
2.5 KiB
JSON

{
"package_name": "codeflash-internal-skills",
"total_scenarios": 5,
"total_capabilities_tested": 13,
"scenarios": [
{
"id": "scenario-1",
"title": "Optimization request for unsupported language (rust) returns error",
"skills_tested": ["debug-optimization-failure"],
"capabilities_tested": [0, 1],
"complexity": "basic",
"description": "A request with language='rust' fails at router dispatch because there is no handler for Rust. Tests ability to trace the failure to the dispatch stage and recommend either schema validation or new language support."
},
{
"id": "scenario-2",
"title": "All optimization candidates removed by postprocessing",
"skills_tested": ["debug-optimization-failure"],
"capabilities_tested": [3, 4],
"complexity": "intermediate",
"description": "A valid Python optimization request produces 5 LLM candidates, but all are removed by deduplication and equality checks. Tests ability to diagnose postprocessing as the failure stage and use logging data to confirm."
},
{
"id": "scenario-3",
"title": "Add Go language support to the optimization system",
"skills_tested": ["add-language-support"],
"capabilities_tested": [5, 6, 7],
"complexity": "intermediate",
"description": "Implement Go as a new language with optimizer support only. Tests ability to follow the 7-step add-language-support workflow, including directory structure, handler class with @register_handler, router dispatch with lazy imports, and test plan."
},
{
"id": "scenario-4",
"title": "Add code-complexity endpoint to aiservice and cf-api",
"skills_tested": ["add-api-endpoint"],
"capabilities_tested": [8, 9, 10],
"complexity": "advanced",
"description": "Create a new endpoint in both Django-Ninja (aiservice) and Express (cf-api). Tests schemas, async handlers, AuthenticatedRequest, URL registration, and correct middleware ordering in Express route registration."
},
{
"id": "scenario-5",
"title": "Instrumented PyTorch tests fail to compile",
"skills_tested": ["debug-test-generation"],
"capabilities_tested": [11, 12],
"complexity": "advanced",
"description": "Test generation succeeds but instrumentation injects torch.cuda.synchronize() without proper newlines, creating a syntax error. Tests ability to trace the failure to the instrumentation stage and identify the GPU sync injection bug."
}
]
}