5 scenarios testing: code serialization format, candidate lifecycle/DAG, deterministic patches, effort levels/selection criteria, and function representation/concurrency model.
26 lines
1.1 KiB
JSON
{
  "context": "Tests whether the agent knows the different OptimizedCandidateSource types and how candidates form a DAG via parent_id references in the codeflash pipeline.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Lists source types",
      "description": "Identifies at least 4 of the 6 OptimizedCandidateSource variants: OPTIMIZE, OPTIMIZE_LP, REFINE, REPAIR, ADAPTIVE, JIT_REWRITE",
      "max_score": 25
    },
    {
      "name": "Parent ID linkage",
      "description": "Explains that REFINE and REPAIR candidates reference their parent via parent_id, creating a DAG/forest structure, NOT independent candidates",
      "max_score": 25
    },
    {
      "name": "Refinement uses runtime data",
      "description": "States that refinement sends runtime data and line profiler results to the AI service (AIServiceRefinerRequest), NOT just the source code",
      "max_score": 25
    },
    {
      "name": "Repair uses test diffs",
      "description": "States that repair sends test failure diffs (TestDiff with scope: RETURN_VALUE/STDOUT/DID_PASS) to the AI service, NOT just error messages",
      "max_score": 25
    }
  ]
}