mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
* feat(blackbox): add package with models, CLI, and HTMX dashboard * test(blackbox): add comprehensive test coverage for dashboard * feat(blackbox): cache session scanning via watcher invalidation * docs(blackbox): add README and use fastapi[standard] for dev server * refactor(blackbox): extract presentation logic into formatter classes * refactor(blackbox): extract classify_error helpers * feat(blackbox): wire analytics into session detail view Show token usage, tool breakdowns, and session stats in a collapsible panel when viewing a session. * feat(blackbox): add codeflash plugin detection Detect codeflash agent names, skills, and commands in transcripts. Surface language, optimization domain, and capability badges in the analytics panel. * refactor(blackbox): remove underscore prefixes from internal functions * chore: add ty python-version to root pyproject.toml * chore(blackbox): fix lint errors in test files * style(blackbox): apply ruff formatting to analytics * feat(blackbox): add Playwright E2E tests for dashboard Refactor app.py to expose create_app() factory accepting a projects_dir override, enabling tests to run against fixture data instead of the real ~/.claude/projects/ directory. Routes now read projects_dir from app.state instead of the module-level constant. Add 26 Playwright tests across 5 files covering dashboard loading, session list, session detail with filters and analytics, sidebar collapse/localStorage persistence, and SSE log streaming. All tests pass on chromium, firefox, and webkit (78 total). CI gets a new e2e-blackbox job with a browser matrix strategy running all three engines in parallel, conditional on blackbox path changes, with trace upload on failure. * fix(ci): sync only blackbox package in e2e job * fix(ci): exclude e2e tests from unit test job The test job doesn't install Playwright browsers, so e2e tests error when pytest collects them. 
Ignore tests/e2e/ directories in the test job — those are handled by the dedicated e2e-blackbox job.
305 lines
10 KiB
Python
305 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
import attrs
|
|
import pytest
|
|
|
|
from blackbox.models import (
|
|
ProjectStats,
|
|
Recommendation,
|
|
SessionAudit,
|
|
SessionDigest,
|
|
SessionEvent,
|
|
WeekStats,
|
|
arrow,
|
|
sparkline,
|
|
)
|
|
from tests.conftest import make_audit, make_meta
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# sparkline and arrow
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSparkline:
    """Unit tests for the ``sparkline`` glyph renderer."""

    def test_empty_or_single_returns_empty(self) -> None:
        # Fewer than two points cannot express a trend, so nothing is drawn.
        assert sparkline([]) == ""
        assert sparkline([1.0]) == ""

    def test_ascending_produces_increasing_chars(self) -> None:
        chars = sparkline([0.0, 0.5, 1.0])
        assert len(chars) == 3
        # Rising data renders a glyph at least as tall at the end as the start.
        assert chars[0] <= chars[-1]

    def test_descending_produces_decreasing_chars(self) -> None:
        chars = sparkline([1.0, 0.5, 0.0])
        assert chars[0] >= chars[-1]

    def test_constant_values_produce_middle_char(self) -> None:
        chars = sparkline([5.0, 5.0, 5.0])
        assert len(chars) == 3
        # A flat series collapses to one repeated glyph.
        assert len(set(chars)) == 1

    def test_two_values_uses_full_range(self) -> None:
        chars = sparkline([0.0, 1.0])
        assert len(chars) == 2
        # Min and max of a two-point series map to distinct glyphs.
        assert chars[0] != chars[-1]
|
|
|
|
|
|
class TestArrow:
    """Unit tests for the ``arrow`` trend-direction indicator."""

    def test_near_zero_delta_returns_equals(self) -> None:
        # Deltas inside the dead zone (here |d| <= 0.04) render as flat.
        for delta in (0.0, 0.04, -0.04):
            assert arrow(delta) == "="

    def test_positive_delta_returns_up(self) -> None:
        assert arrow(0.1) == "^"

    def test_negative_delta_returns_down(self) -> None:
        assert arrow(-0.1) == "v"

    def test_invert_flips_positive(self) -> None:
        # invert=True is for metrics where an increase is bad (e.g. errors).
        assert arrow(0.1, invert=True) == "v"

    def test_invert_flips_negative(self) -> None:
        assert arrow(-0.1, invert=True) == "^"

    def test_invert_near_zero_still_equals(self) -> None:
        # Inversion must not disturb the flat case.
        assert arrow(0.0, invert=True) == "="
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SessionEvent
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSessionEvent:
    """Tests for the frozen ``SessionEvent`` attrs record."""

    @staticmethod
    def _event() -> SessionEvent:
        # Minimal positional construction shared by several tests below.
        return SessionEvent("ts", "user", "hi", None, None, None, False, None, None)

    def test_construction(self) -> None:
        event = SessionEvent(
            timestamp="2024-01-01T00:00:00Z",
            speaker="user",
            text="hello",
            tool_name=None,
            file_path=None,
            command=None,
            is_error=False,
            error_category=None,
            attachment_type=None,
        )
        assert event.speaker == "user"
        assert event.text == "hello"
        assert not event.is_error

    def test_frozen(self) -> None:
        event = self._event()
        # Frozen attrs classes reject attribute assignment after init.
        with pytest.raises(attrs.exceptions.FrozenInstanceError):
            event.speaker = "assistant"  # type: ignore[misc]

    def test_equality(self) -> None:
        # attrs generates value-based __eq__, so two identical records compare equal.
        assert self._event() == self._event()

    def test_attrs_asdict(self) -> None:
        d = attrs.asdict(self._event())
        assert d["speaker"] == "user"
        assert d["text"] == "hi"
        assert json.dumps(d)  # JSON-serializable
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SessionMeta — properties
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSessionMetaProperties:
    """Derived-property tests for ``SessionMeta`` built via ``make_meta``."""

    def test_duration_minutes(self) -> None:
        meta = make_meta(duration_s=3600.0)
        assert meta.duration_minutes == 60.0

    def test_duration_minutes_zero(self) -> None:
        meta = make_meta(duration_s=0.0)
        assert meta.duration_minutes == 0.0

    def test_total_tokens(self) -> None:
        meta = make_meta(input_tokens=1000, output_tokens=500)
        assert meta.total_tokens == 1500

    def test_total_tokens_default(self) -> None:
        # All token counts default to zero.
        assert make_meta().total_tokens == 0

    def test_cache_hit_rate(self) -> None:
        # 300 cache reads out of 1000 input-side tokens -> 0.3.
        meta = make_meta(input_tokens=500, cache_read_tokens=300, cache_creation_tokens=200)
        assert meta.cache_hit_rate == 0.3

    def test_cache_hit_rate_zero_tokens(self) -> None:
        # No tokens at all must not raise (no division by zero).
        assert make_meta().cache_hit_rate == 0.0

    def test_cache_hit_rate_full(self) -> None:
        # Every input-side token served from cache -> rate of exactly 1.0.
        meta = make_meta(input_tokens=0, cache_read_tokens=1000, cache_creation_tokens=0)
        assert meta.cache_hit_rate == 1.0
|
|
|
|
|
|
class TestSessionMetaFrozen:
    """Immutability contract of ``SessionMeta``."""

    def test_frozen(self) -> None:
        # Frozen attrs instances raise on any attribute assignment.
        with pytest.raises(attrs.exceptions.FrozenInstanceError):
            make_meta().session_id = "new"  # type: ignore[misc]
|
|
|
|
|
|
class TestSessionMetaAsDict:
    """``attrs.asdict`` serialization of ``SessionMeta``."""

    def test_returns_dict(self) -> None:
        data = attrs.asdict(make_meta())
        assert isinstance(data, dict)
        # Values below come from make_meta()'s fixture defaults.
        assert data["session_id"] == "abcd1234-5678-9012-3456-789012345678"
        assert data["duration_s"] == 3600.0

    def test_includes_optional_fields(self) -> None:
        # Optional git metadata survives the round-trip into a plain dict.
        data = attrs.asdict(make_meta(git_branch="feature", git_commits=3))
        assert data["git_branch"] == "feature"
        assert data["git_commits"] == 3
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SessionAudit
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSessionAuditDefaults:
    """Default values and immutability of ``SessionAudit``."""

    def test_defaults(self) -> None:
        # Only session_id is required; everything else has a neutral default.
        audit = SessionAudit(session_id="x")
        assert audit.outcome == "unclear"
        assert audit.satisfaction == "neutral"
        assert audit.session_type == "single_task"
        assert audit.goal_categories == {}
        assert audit.friction_counts == {}
        assert audit.user_instructions == ()
        assert audit.summary == ""

    def test_frozen(self) -> None:
        audit = SessionAudit(session_id="x")
        with pytest.raises(attrs.exceptions.FrozenInstanceError):
            audit.outcome = "success"  # type: ignore[misc]
|
|
|
|
|
|
class TestSessionAuditAsDict:
    """``attrs.asdict`` serialization of ``SessionAudit``."""

    def test_returns_dict(self) -> None:
        data = attrs.asdict(make_audit())
        assert isinstance(data, dict)
        # make_audit()'s fixture default outcome.
        assert data["outcome"] == "mostly_achieved"

    def test_reflects_values(self) -> None:
        # Overridden fields show up verbatim in the dict.
        audit = make_audit(outcome="success", satisfaction="positive", session_type="multi_task")
        data = attrs.asdict(audit)
        assert data["outcome"] == "success"
        assert data["session_type"] == "multi_task"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ProjectStats — mutable is_outlier
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestProjectStats:
    """Tests for ``ProjectStats``, whose ``is_outlier`` flag is mutable."""

    def make(self, **kw: Any) -> ProjectStats:
        """Build a ProjectStats with sensible defaults, overridable via kwargs."""
        base: dict[str, Any] = {
            "project_path": "/proj/myapp",
            "project_name": "myapp",
            "session_count": 10,
            "success_rate": 0.9,
            "avg_tool_errors": 2.5,
            "avg_duration_s": 600.0,
            "top_error_categories": (),
            "top_friction": (),
        }
        return ProjectStats(**{**base, **kw})

    def test_is_outlier_default_false(self) -> None:
        assert not self.make().is_outlier

    def test_is_outlier_mutable(self) -> None:
        # Unlike the other models, is_outlier is set after aggregation,
        # so assignment must succeed rather than raise FrozenInstanceError.
        stats = self.make()
        stats.is_outlier = True
        assert stats.is_outlier
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# WeekStats + Recommendation
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWeekStats:
    """Tests for the frozen ``WeekStats`` record."""

    @staticmethod
    def _week() -> WeekStats:
        # Shared minimal fixture for a single ISO week.
        return WeekStats(
            week="2026-W17", session_count=3, success_rate=0.7, avg_errors_per_session=1.0, avg_duration_s=600.0
        )

    def test_frozen(self) -> None:
        week = self._week()
        with pytest.raises(attrs.exceptions.FrozenInstanceError):
            week.session_count = 5  # type: ignore[misc]

    def test_default_error_counts(self) -> None:
        # error_category_counts defaults to an empty mapping when omitted.
        assert self._week().error_category_counts == {}
|
|
|
|
|
|
class TestRecommendation:
    """Immutability contract of ``Recommendation``."""

    def test_frozen(self) -> None:
        rec = Recommendation(suggestion="do X", evidence="50%", frequency=0.5, source_sessions=5)
        with pytest.raises(attrs.exceptions.FrozenInstanceError):
            rec.suggestion = "do Y"  # type: ignore[misc]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# SessionDigest
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestSessionDigest:
    """Tests for ``SessionDigest`` construction and JSON round-tripping."""

    def make(self, **kw: Any) -> SessionDigest:
        """Build a SessionDigest with minimal defaults, overridable via kwargs."""
        base: dict[str, Any] = {"session_count": 10, "date_range": (100.0, 500.0), "success_rate": 0.8}
        return SessionDigest(**{**base, **kw})

    def test_attrs_asdict(self) -> None:
        data = attrs.asdict(self.make())
        assert data["session_count"] == 10
        assert data["success_rate"] == 0.8

    def test_frozen(self) -> None:
        digest = self.make()
        with pytest.raises(attrs.exceptions.FrozenInstanceError):
            digest.session_count = 99  # type: ignore[misc]

    def test_json_serializable(self) -> None:
        # default=str covers any non-primitive values during serialization.
        dumped = json.dumps(attrs.asdict(self.make(session_count=5)), indent=2, default=str)
        assert json.loads(dumped)["session_count"] == 5

    def test_json_with_nested_weeks(self) -> None:
        # attrs.asdict recurses into nested attrs instances.
        week = WeekStats(
            week="2026-W17", session_count=3, success_rate=0.7, avg_errors_per_session=1.0, avg_duration_s=600.0
        )
        dumped = json.dumps(attrs.asdict(self.make(weeks=(week,))), indent=2, default=str)
        parsed = json.loads(dumped)
        assert len(parsed["weeks"]) == 1
        assert parsed["weeks"][0]["week"] == "2026-W17"

    def test_json_with_nested_projects(self) -> None:
        project = ProjectStats(
            project_path="/p",
            project_name="p",
            session_count=5,
            success_rate=0.8,
            avg_tool_errors=1.0,
            avg_duration_s=300.0,
            top_error_categories=(),
            top_friction=(),
            is_outlier=False,
        )
        dumped = json.dumps(attrs.asdict(self.make(projects=(project,))), indent=2, default=str)
        parsed = json.loads(dumped)
        assert len(parsed["projects"]) == 1
        assert parsed["projects"][0]["project_name"] == "p"
|