# Mirror of https://github.com/codeflash-ai/codeflash-agent.git
# Synced 2026-05-04 18:25:19 +00:00 · 57 lines · 2.1 KiB · Python
import pytest
|
|
from analytics.pipeline import run_pipeline
|
|
|
|
|
|
def test_basic():
    """Smoke-test run_pipeline on five records, three of which should be dropped.

    The input mixes two clean records with one carrying a blocked tag
    ("spam"), one from a blocked source ("banned") and one without an
    ``id`` field.  The test expects exactly two entries in
    ``result["records"]`` and a non-empty ``result["summary"]``.
    """
    pipeline_config = dict(
        required_fields=["id", "value", "category", "source"],
        blocked_sources=["banned"],
        blocked_tags=["spam"],
        min_score=0,
        defaults={"priority": 0},
    )

    # Records the test expects to come out of the pipeline.
    accepted = [
        {"id": 1, "value": 10.0, "category": "web", "source": "s1", "tags": []},
        {"id": 2, "value": 20.0, "category": "web", "source": "s2", "tags": []},
    ]
    # Records the test expects to be filtered, one per rejection reason.
    rejected = [
        # blocked tag
        {"id": 3, "value": 5.0, "category": "api", "source": "s1", "tags": ["spam"]},
        # blocked source
        {"id": 4, "value": 15.0, "category": "api", "source": "banned", "tags": []},
        # missing id
        {"value": 1.0, "category": "x", "source": "s1", "tags": []},
    ]

    outcome = run_pipeline(accepted + rejected, pipeline_config)

    surviving = outcome["records"]
    assert len(surviving) == len(accepted)
    assert len(outcome["summary"]) > 0
|
|
|
|
|
|
def test_large_batch():
    """Exercise run_pipeline on a production-sized batch of 5,000 records.

    This is the workload on which run_pipeline was noted to be too slow.
    Records cycle through 50 categories and 20 sources by index modulo, and
    none of their sources or tags appear in the configured block lists.
    The test expects every record back, one summary entry per category with
    a ``count`` of 100, and a ``"source_overlap"`` key in each entry.
    """
    total_records = 5_000
    categories = [f"cat-{i}" for i in range(50)]
    sources = [f"source-{i}" for i in range(20)]
    expected_per_category = total_records // len(categories)  # 100

    config = dict(
        required_fields=["id", "value", "category", "source"],
        blocked_sources=[f"blocked-{i}" for i in range(15)],
        blocked_tags=[f"bad-{i}" for i in range(10)],
        min_score=0,
        defaults=dict(
            priority=0,
            region="us-east",
            pipeline_version="v3",
        ),
    )

    records = [
        {
            "id": i,
            "value": float(i % 200) / 10,
            "category": categories[i % len(categories)],
            "source": sources[i % len(sources)],
            "tags": [f"tag-{i % 8}"],
            "label": f"record value {i} with spaces",
            "description": f"item-{i}.data-field",
        }
        for i in range(total_records)
    ]

    result = run_pipeline(records, config)

    assert len(result["records"]) == total_records
    summary = result["summary"]
    assert len(summary) == len(categories)
    # Two separate passes, in the same order as the original checks.
    assert all(entry["count"] == expected_per_category for entry in summary.values())
    assert all("source_overlap" in entry for entry in summary.values())
|