5 scenarios testing: sequential debugging, Result type + effort config, test patterns, domain type conventions, and deduplication/repair mechanics. Also adds tessl-labs/tessl-skill-eval-scenarios dev dependency.
26 lines
1.1 KiB
JSON
26 lines
1.1 KiB
JSON
{
  "context": "Tests whether the agent understands codeflash's candidate deduplication via AST normalization and the specific conditions under which code repair is triggered vs skipped.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "AST normalization",
      "description": "Mentions that deduplication uses AST normalization (normalize_code from code_utils/deduplicate_code.py), NOT simple string comparison",
      "max_score": 25
    },
    {
      "name": "Duplicate result copying",
      "description": "Explains that duplicate candidates copy results from the first-seen candidate rather than being re-tested",
      "max_score": 25
    },
    {
      "name": "Repair trigger threshold",
      "description": "States that repair triggers when fewer than 2 candidates pass (MIN_CORRECT_CANDIDATES=2), NOT when zero candidates pass or when any candidate fails",
      "max_score": 25
    },
    {
      "name": "Unmatched percentage limit",
      "description": "Mentions REPAIR_UNMATCHED_PERCENTAGE_LIMIT as a condition that can cause repair to be skipped entirely, with effort-dependent values (0.2/0.3/0.4)",
      "max_score": 25
    }
  ]
}