5 scenarios testing: code serialization format, candidate lifecycle/DAG, deterministic patches, effort levels/selection criteria, and function representation/concurrency model.
21 lines
1 KiB
JSON
{
  "context": "Tests whether the agent knows the CodeStringsMarkdown serialization format and the distinction between read-writable and read-only code context in the codeflash pipeline.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Markdown code block format",
      "description": "Uses the correct fenced code block format with language:filepath syntax (```python:path/to/file.py) when constructing code for the AI service, NOT plain code blocks without file paths",
      "max_score": 30
    },
    {
      "name": "Read-writable vs read-only split",
      "description": "Correctly separates code into read_writable_code (code the LLM can modify) and read_only_context_code (reference-only dependency code), NOT treating all code as modifiable",
      "max_score": 35
    },
    {
      "name": "parse_markdown_code usage",
      "description": "Uses CodeStringsMarkdown.parse_markdown_code() to parse AI service responses back into structured code, NOT manual string splitting or regex",
      "max_score": 35
    }
  ]
}