5 scenarios testing: sequential debugging, Result type + effort config, test patterns, domain type conventions, and deduplication/repair mechanics. Also adds tessl-labs/tessl-skill-eval-scenarios dev dependency.
26 lines
1.2 KiB
JSON
{
  "context": "Tests whether the agent follows the sequential debugging workflow from the skill, checking pipeline stages in order and using correct threshold values when diagnosing an optimization that produced no results.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Sequential stage order",
      "description": "Investigates pipeline stages in order: discovery before ranking before context before AI service before test failures. Does NOT jump to later stages without checking earlier ones first.",
      "max_score": 25
    },
    {
      "name": "Token limit value",
      "description": "References the specific token limit of 16000 for OPTIMIZATION_CONTEXT_TOKEN_LIMIT or TESTGEN_CONTEXT_TOKEN_LIMIT when checking context extraction",
      "max_score": 25
    },
    {
      "name": "Importance threshold",
      "description": "References DEFAULT_IMPORTANCE_THRESHOLD=0.001 when checking function ranking",
      "max_score": 25
    },
    {
      "name": "Stops at failure",
      "description": "Identifies the failing stage and focuses investigation there rather than continuing through all remaining stages",
      "max_score": 25
    }
  ]
}