codeflash-agent/codeflash-evals/templates/ranking/tests/test_pipeline.py
Kevin Turcios 37efa524d7 feat: improve skill, eval system, and tessl config
- Optimize codeflash-optimize SKILL.md (review score 17% → 98%, eval 87% → 100%)
  - Fix frontmatter (allowed-tools format, argument-hint under metadata)
  - Lead description with concrete actions, explicit agent launch parameters
- Add multi-run variance detection to eval system (--runs N flag)
  - score.py aggregate command: min/max/avg/stddev per criterion, flaky detection
  - check-regression.sh defaults to 3 runs for reliable regression detection
- Add per-criterion regression tracking to baseline-scores.json (v3)
  - Reports exactly which criteria regressed, not just total score drops
- Rename evals/ → codeflash-evals/ to avoid tessl directory conflicts
- Switch tessl to managed mode, gitignore vendored tiles and symlinks
2026-03-27 11:30:17 -05:00

44 lines
1.6 KiB
Python

import pytest
from pipeline.core import run_pipeline
def test_basic():
    """Check that blocked and incomplete records are dropped from a small batch."""
    rows = [
        {"id": 1, "value": "hello world", "category": "a", "base_score": 1.0},
        {"id": 2, "value": "foo", "category": "a", "base_score": 2.0},
        {"id": 999, "value": "blocked", "category": "b", "base_score": 1.0},
        # missing base_score is ok (defaults to 1.0)
        {"id": 3, "value": "bar", "category": "b"},
        # missing required field
        {"value": "no id"},
    ]
    settings = {
        "required_fields": ["id", "value", "category"],
        "blocked_ids": [999],
    }

    output = run_pipeline(rows, settings)

    # Five records go in; the blocked one and the one without an id are
    # expected to be filtered, leaving three.
    assert len(output) == 3
    assert not any(row["id"] == 999 for row in output)
def test_large_batch():
    """Production-sized batch — run_pipeline is too slow on 5k records."""
    batch_size = 5_000
    category_pool = [f"cat-{n}" for n in range(20)]
    source_pool = [f"source-{n}" for n in range(10)]

    # Categories and sources are assigned round-robin from their pools.
    batch = [
        {
            "id": idx,
            "value": f"record value {idx} with extra spaces",
            "category": category_pool[idx % len(category_pool)],
            "source": source_pool[idx % len(source_pool)],
            "base_score": float(idx % 100) / 10,
        }
        for idx in range(batch_size)
    ]
    settings = {
        "required_fields": ["id", "value", "category", "source"],
        "blocked_ids": list(range(9000, 9100)),  # 100 blocked IDs
    }

    output = run_pipeline(batch, settings)

    # none blocked (IDs 0-4999, blocklist 9000-9099)
    assert len(output) == batch_size
    assert not any("score" not in row for row in output)