codeflash-agent/evals/templates/memory-balanced/manifest.json
2026-04-03 17:36:50 -05:00

60 lines
2.8 KiB
JSON
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"name": "memory-balanced",
"description": "Order pipeline with 3 memory issues of similar weight (~30% each). No single dominant allocator — tests iterative profiling and fix-then-reprofile discipline.",
"eval_type": "memory-balanced",
"test_command": "PYTHONPATH=src uv run python -m pytest tests/ -v",
"bugs": [
{
"id": "validation-audit-context",
"file": "src/orders/core.py",
"function": "validate_orders",
"description": "Stores a _audit dict on every order with a formatted order_repr string and checks dict. 50K orders × ~600B = ~36MB. Never read after validation.",
"expected_fix": "Don't store audit context on orders, or store only a pass/fail boolean",
"is_dominant": false,
"peak_contribution_pct": 30
},
{
"id": "pricing-receipt-strings",
"file": "src/orders/core.py",
"function": "compute_pricing",
"description": "Stores a _receipt formatted string on every order for invoice generation. 50K strings + __dict__ resize overhead from adding new attributes post-init = ~47MB.",
"expected_fix": "Don't store receipt string on each order. Compute final_price directly without intermediate storage. Use __slots__ to avoid dict resize.",
"is_dominant": false,
"peak_contribution_pct": 40
},
{
"id": "fulfillment-label-materialization",
"file": "src/orders/core.py",
"function": "build_fulfillment_plan",
"description": "Pre-generates formatted shipping label strings with json.dumps(metadata) for each order. 50K labels × ~700B = ~35MB. Summary only needs counts, not labels.",
"expected_fix": "Don't materialize label strings. Store only order_id, warehouse, priority in plan entries.",
"is_dominant": false,
"peak_contribution_pct": 30
}
],
"test_file": "tests/test_orders.py",
"test_name": "test_large_batch",
"data_size": 50000,
"memory_profile": {
"peak_rss_mb": 168,
"overhead_mb": 118,
"target_peak_mb": 70
},
"rubric": {
"criteria": {
"used_memory_profiler": 1,
"profiled_iteratively": 3,
"identified_all_three": 3,
"fixed_issues": 2,
"tests_pass": 1
},
"total": 10,
"notes": {
"used_memory_profiler": "Used tracemalloc, memray, or similar — not just source reading",
"profiled_iteratively": "Re-profiled after each fix to find the next contributor. 3pts for fix→profile→fix→profile cycle. 1pt for fixing all from one profile without re-profiling.",
"identified_all_three": "1pt per issue correctly identified: validation audit context, pricing receipt strings, fulfillment label materialization",
"fixed_issues": "1pt per 2 issues fixed with measurable memory reduction (max 2pts for all 3 fixed)",
"tests_pass": "All tests pass after changes"
}
}
}