codeflash-agent/codeflash-evals/templates/memory-hard/tests/test_pipeline.py
Kevin Turcios 37efa524d7 feat: improve skill, eval system, and tessl config
- Optimize codeflash-optimize SKILL.md (review score 17% → 98%, eval 87% → 100%)
  - Fix frontmatter (allowed-tools format, argument-hint under metadata)
  - Lead description with concrete actions, explicit agent launch parameters
- Add multi-run variance detection to eval system (--runs N flag)
  - score.py aggregate command: min/max/avg/stddev per criterion, flaky detection
  - check-regression.sh defaults to 3 runs for reliable regression detection
- Add per-criterion regression tracking to baseline-scores.json (v3)
  - Reports exactly which criteria regressed, not just total score drops
- Rename evals/ → codeflash-evals/ to avoid tessl directory conflicts
- Switch tessl to managed mode, gitignore vendored tiles and symlinks
2026-03-27 11:30:17 -05:00

62 lines
2.5 KiB
Python

import pytest
from pipeline.core import process_readings
def test_basic():
    """Smoke test: readings are grouped by sensor_type with per-type counts."""
    readings = [
        {"id": 1, "sensor_type": "temp", "timestamp": "2024-01-01T00:00:00",
         "value": 22.5, "metadata": {"location": {"lat": 0, "lng": 0}}},
        {"id": 2, "sensor_type": "temp", "timestamp": "2024-01-01T01:00:00",
         "value": 23.0, "metadata": {"location": {"lat": 0, "lng": 0}}},
        {"id": 3, "sensor_type": "humidity", "timestamp": "2024-01-01T00:00:00",
         "value": 45.0, "metadata": {"location": {"lat": 0, "lng": 0}}},
    ]

    summary = process_readings(readings)

    # Two sensor types in, two groups out; counts match the input distribution.
    assert len(summary) == 2
    assert summary["temp"]["count"] == 2
    assert summary["humidity"]["count"] == 1
def test_large_dataset():
    """Production-scale dataset — process_readings uses too much memory.

    With 50k readings, peak memory is far higher than the input data size.
    The goal is to reduce memory usage while preserving correctness.
    """
    sensor_types = [f"sensor-{i}" for i in range(50)]

    def build_reading(i):
        # One synthetic reading; every field is a deterministic function of i
        # so the 50k records spread evenly across the 50 sensor types.
        return {
            "id": i,
            "sensor_type": sensor_types[i % len(sensor_types)],
            "timestamp": f"2024-{(i % 12) + 1:02d}-{(i % 28) + 1:02d}T{(i % 24):02d}:00:00",
            "value": round(20.0 + (i % 100) * 0.1, 2),
            "unit": "celsius",
            "metadata": {
                "location": {
                    "lat": round(37.0 + (i % 10) * 0.1, 4),
                    "lng": round(-122.0 + (i % 10) * 0.1, 4),
                    "altitude": float(i % 50),
                },
                "firmware": f"v{1 + i % 3}.{i % 10}.0",
                "calibration_date": "2024-01-15",
                "sensor_config": {
                    "sample_rate": 100,
                    "precision": "high",
                    "filter": "kalman",
                },
                "deployment": {
                    "site": f"site-{i % 10}",
                    "rack": f"rack-{chr(65 + i % 8)}",
                    "position": i % 20,
                },
                "tags": [f"tag-{i % 5}", f"tag-{(i + 1) % 5}"],
                "history": [f"event-{j}" for j in range(3)],
            },
        }

    raw = [build_reading(i) for i in range(50_000)]

    result = process_readings(raw)

    # 50 types x 1000 readings each; means are strictly positive by construction.
    assert len(result) == 50
    assert all(group["count"] == 1000 for group in result.values())
    assert all(group["mean"] > 0 for group in result.values())