codeflash-agent/evals/templates/ranking-hard/tests/test_pipeline.py
2026-04-03 17:36:50 -05:00

57 lines
2.1 KiB
Python

import pytest
from analytics.pipeline import run_pipeline
def test_basic():
    """Smoke-test ``run_pipeline`` on a small hand-written batch.

    Five records go in and exactly two are expected back: one record has
    no ``id``, one uses a blocked source, and one carries a blocked tag.
    The summary is only checked for being non-empty.
    """
    batch = [
        {"id": 1, "value": 10.0, "category": "web", "source": "s1", "tags": []},
        {"id": 2, "value": 20.0, "category": "web", "source": "s2", "tags": []},
        # Expected to be filtered: tagged "spam", which is a blocked tag.
        {"id": 3, "value": 5.0, "category": "api", "source": "s1", "tags": ["spam"]},
        # Expected to be filtered: "banned" is a blocked source.
        {"id": 4, "value": 15.0, "category": "api", "source": "banned", "tags": []},
        # Expected to be filtered: missing id.
        {"value": 1.0, "category": "x", "source": "s1", "tags": []},
    ]
    settings = dict(
        required_fields=["id", "value", "category", "source"],
        blocked_sources=["banned"],
        blocked_tags=["spam"],
        min_score=0,
        defaults={"priority": 0},
    )

    outcome = run_pipeline(batch, settings)

    surviving = outcome["records"]
    assert len(surviving) == 2
    summary = outcome["summary"]
    assert len(summary) > 0
def test_large_batch():
    """Production-sized batch: ``run_pipeline`` is too slow on 5k records.

    Builds 5,000 synthetic records assigned round-robin to 50 categories
    and 20 sources. No record uses a ``blocked-*`` source or a ``bad-*``
    tag, so every record is expected to come back. The summary is expected
    to hold 50 entries, each with a ``count`` of 100 (5,000 / 50) and a
    ``source_overlap`` key.
    """
    category_pool = [f"cat-{n}" for n in range(50)]
    source_pool = [f"source-{n}" for n in range(20)]
    n_categories = len(category_pool)
    n_sources = len(source_pool)

    settings = {
        "required_fields": ["id", "value", "category", "source"],
        "blocked_sources": [f"blocked-{n}" for n in range(15)],
        "blocked_tags": [f"bad-{n}" for n in range(10)],
        "min_score": 0,
        "defaults": {
            "priority": 0,
            "region": "us-east",
            "pipeline_version": "v3",
        },
    }

    batch = [
        {
            "id": idx,
            # Values cycle through 0.0 .. 19.9 in steps of 0.1.
            "value": float(idx % 200) / 10,
            "category": category_pool[idx % n_categories],
            "source": source_pool[idx % n_sources],
            "tags": [f"tag-{idx % 8}"],
            "label": f"record value {idx} with spaces",
            "description": f"item-{idx}.data-field",
        }
        for idx in range(5_000)
    ]

    outcome = run_pipeline(batch, settings)

    assert len(outcome["records"]) == 5_000
    per_category = outcome["summary"]
    assert len(per_category) == 50
    # 5,000 records spread evenly over 50 categories -> 100 each.
    assert not any(entry["count"] != 100 for entry in per_category.values())
    assert not any("source_overlap" not in entry for entry in per_category.values())