codeflash/tiles/codeflash-docs/evals/scenario-3/criteria.json
Kevin Turcios 869fbe1766 chore: add eval scenarios for codeflash-docs tile
5 scenarios testing: code serialization format, candidate lifecycle/DAG,
deterministic patches, effort levels/selection criteria, and function
representation/concurrency model.
2026-02-14 21:29:22 -05:00

31 lines
1.2 KiB
JSON

{
"context": "Tests whether the agent knows the specific deterministic patch values used in codeflash's pytest plugin and the subprocess-based test execution architecture.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Subprocess isolation",
"description": "States that tests run in a subprocess to isolate the test environment from the main codeflash process, NOT in the same process",
"max_score": 20
},
{
"name": "Fixed time value",
"description": "References at least one of the specific fixed values: the timestamp 1761717605.108106 for time.time(), or the datetime 2021-01-01 02:05:10 UTC for datetime.now()",
"max_score": 20
},
{
"name": "Fixed UUID value",
"description": "References the specific fixed UUID 12345678-1234-5678-9abc-123456789012 for uuid4/uuid1",
"max_score": 20
},
{
"name": "Random seed",
"description": "States that random is seeded with 42 (NOT a different seed value)",
"max_score": 20
},
{
"name": "Plugin blocklists",
"description": "Mentions that behavioral tests block specific pytest plugins to ensure deterministic execution, naming at least 2 of: benchmark, codspeed, xdist, sugar",
"max_score": 20
}
]
}