codeflash-agent/evals/templates/memory-balanced/manifest.json

{
  "name": "memory-balanced",
  "description": "Order pipeline with 3 memory issues of similar weight (~30% each). No single dominant allocator — tests iterative profiling and fix-then-reprofile discipline.",
  "eval_type": "memory-balanced",
  "test_command": "PYTHONPATH=src uv run python -m pytest tests/ -v",
  "bugs": [
    {
      "id": "validation-audit-context",
      "file": "src/orders/core.py",
      "function": "validate_orders",
      "description": "Stores a _audit dict on every order with a formatted order_repr string and checks dict. 50K orders × ~600B = ~36MB. Never read after validation.",
      "expected_fix": "Don't store audit context on orders, or store only a pass/fail boolean",
      "is_dominant": false,
      "peak_contribution_pct": 30
    },
    {
      "id": "pricing-receipt-strings",
      "file": "src/orders/core.py",
      "function": "compute_pricing",
      "description": "Stores a _receipt formatted string on every order for invoice generation. 50K strings + __dict__ resize overhead from adding new attributes post-init = ~47MB.",
      "expected_fix": "Don't store receipt string on each order. Compute final_price directly without intermediate storage. Use __slots__ to avoid dict resize.",
      "is_dominant": false,
      "peak_contribution_pct": 40
    },
    {
      "id": "fulfillment-label-materialization",
      "file": "src/orders/core.py",
      "function": "build_fulfillment_plan",
      "description": "Pre-generates formatted shipping label strings with json.dumps(metadata) for each order. 50K labels × ~700B = ~35MB. Summary only needs counts, not labels.",
      "expected_fix": "Don't materialize label strings. Store only order_id, warehouse, priority in plan entries.",
      "is_dominant": false,
      "peak_contribution_pct": 30
    }
  ],
  "test_file": "tests/test_orders.py",
  "test_name": "test_large_batch",
  "data_size": 50000,
  "memory_profile": {
    "peak_rss_mb": 168,
    "overhead_mb": 118,
    "target_peak_mb": 70
  },
  "rubric": {
    "criteria": {
      "used_memory_profiler": 1,
      "profiled_iteratively": 3,
      "identified_all_three": 3,
      "fixed_issues": 2,
      "tests_pass": 1
    },
    "total": 10,
    "notes": {
      "used_memory_profiler": "Used tracemalloc, memray, or similar — not just source reading",
      "profiled_iteratively": "Re-profiled after each fix to find the next contributor. 3pts for fix→profile→fix→profile cycle. 1pt for fixing all from one profile without re-profiling.",
      "identified_all_three": "1pt per issue correctly identified: validation audit context, pricing receipt strings, fulfillment label materialization",
      "fixed_issues": "1pt per 2 issues fixed with measurable memory reduction (max 2pts for all 3 fixed)",
      "tests_pass": "All tests pass after changes"
    }
  }
}