codeflash-agent/packages/blackbox/tests/test_formatting.py
Kevin Turcios 0ad5e60523
Add blackbox package: session flight recorder with HTMX dashboard (#39)
* feat(blackbox): add package with models, CLI, and HTMX dashboard

* test(blackbox): add comprehensive test coverage for dashboard

* feat(blackbox): cache session scanning via watcher invalidation

* docs(blackbox): add README and use fastapi[standard] for dev server

* refactor(blackbox): extract presentation logic into formatter classes

* refactor(blackbox): extract classify_error helpers

* feat(blackbox): wire analytics into session detail view

Show token usage, tool breakdowns, and session stats in a
collapsible panel when viewing a session.

* feat(blackbox): add codeflash plugin detection

Detect codeflash agent names, skills, and commands in transcripts.
Surface language, optimization domain, and capability badges in
the analytics panel.

* refactor(blackbox): remove underscore prefixes from internal functions

* chore: add ty python-version to root pyproject.toml

* chore(blackbox): fix lint errors in test files

* style(blackbox): apply ruff formatting to analytics

* feat(blackbox): add Playwright E2E tests for dashboard

Refactor app.py to expose create_app() factory accepting a projects_dir
override, enabling tests to run against fixture data instead of the real
~/.claude/projects/ directory. Routes now read projects_dir from
app.state instead of the module-level constant.

Add 26 Playwright tests across 5 files covering dashboard loading,
session list, session detail with filters and analytics, sidebar
collapse/localStorage persistence, and SSE log streaming. All tests
pass on chromium, firefox, and webkit (78 total).

CI gets a new e2e-blackbox job with a browser matrix strategy running
all three engines in parallel, conditional on blackbox path changes,
with trace upload on failure.

* fix(ci): sync only blackbox package in e2e job

* fix(ci): exclude e2e tests from unit test job

The test job doesn't install Playwright browsers, so e2e tests error
when pytest collects them. Ignore tests/e2e/ directories in the test
job — those are handled by the dedicated e2e-blackbox job.
2026-04-28 19:58:43 -05:00

333 lines
13 KiB
Python

from __future__ import annotations
import json
from typing import Any
from blackbox.formatting import (
AuditFormatter,
DigestFormatter,
MetaFormatter,
ProjectFormatter,
RecommendationFormatter,
)
from blackbox.models import (
ProjectStats,
Recommendation,
SessionAudit,
SessionDigest,
WeekStats,
)
from tests.conftest import make_meta
# ---------------------------------------------------------------------------
# MetaFormatter
# ---------------------------------------------------------------------------
class TestMetaFormatter:
    """Exercise MetaFormatter.summary() across its optional metadata sections."""

    def test_basic(self) -> None:
        meta = make_meta(input_tokens=5000, output_tokens=2000, tool_errors=2)
        rendered = MetaFormatter(meta).summary()
        # Core header fields are always present.
        assert "abcd1234" in rendered
        assert "60min" in rendered
        assert "10 user / 12 assistant" in rendered
        assert "25 calls (2 errors)" in rendered
        assert "5,000 in / 2,000 out" in rendered

    def test_with_git(self) -> None:
        rendered = MetaFormatter(make_meta(git_commits=5, git_branch="main")).summary()
        assert "5 commits on main" in rendered

    def test_git_without_branch(self) -> None:
        rendered = MetaFormatter(make_meta(git_commits=1, git_branch=None)).summary()
        assert "unknown" in rendered

    def test_with_files(self) -> None:
        meta = make_meta(files_modified=3, lines_added=100, lines_removed=20)
        rendered = MetaFormatter(meta).summary()
        assert "3 modified" in rendered
        assert "+100/-20" in rendered

    def test_without_files(self) -> None:
        rendered = MetaFormatter(make_meta(files_modified=0)).summary()
        assert "modified" not in rendered

    def test_with_compactions(self) -> None:
        rendered = MetaFormatter(make_meta(compactions=3)).summary()
        assert "Compactions: 3" in rendered

    def test_without_compactions(self) -> None:
        rendered = MetaFormatter(make_meta(compactions=0)).summary()
        assert "Compactions" not in rendered

    def test_with_interruptions(self) -> None:
        rendered = MetaFormatter(make_meta(user_interruptions=2)).summary()
        assert "Interruptions: 2" in rendered

    def test_without_interruptions(self) -> None:
        rendered = MetaFormatter(make_meta(user_interruptions=0)).summary()
        assert "Interruptions" not in rendered

    def test_top_tools_capped_at_5(self) -> None:
        counts = {"Read": 20, "Edit": 15, "Bash": 10, "Write": 5, "Grep": 3, "X": 1}
        rendered = MetaFormatter(make_meta(tool_counts=counts)).summary()
        # Only the five most-used tools are listed; the sixth is dropped.
        assert "Read=20" in rendered
        assert "X=1" not in rendered

    def test_no_top_tools_when_empty(self) -> None:
        rendered = MetaFormatter(make_meta(tool_counts={})).summary()
        assert "Top tools" not in rendered

    def test_thinking_blocks_shown_when_nonzero(self) -> None:
        rendered = MetaFormatter(make_meta(thinking_blocks=5)).summary()
        assert "Thinking blocks: 5" in rendered

    def test_thinking_blocks_hidden_when_zero(self) -> None:
        rendered = MetaFormatter(make_meta(thinking_blocks=0)).summary()
        assert "Thinking blocks" not in rendered

    def test_web_shown_when_nonzero(self) -> None:
        rendered = MetaFormatter(make_meta(web_searches=3, web_fetches=1)).summary()
        assert "Web: 3 searches / 1 fetches" in rendered

    def test_web_hidden_when_zero(self) -> None:
        rendered = MetaFormatter(make_meta(web_searches=0, web_fetches=0)).summary()
        assert "Web:" not in rendered

    def test_permission_mode_shown_when_set(self) -> None:
        rendered = MetaFormatter(make_meta(permission_mode="bypassPermissions")).summary()
        assert "Permission mode: bypassPermissions" in rendered

    def test_permission_mode_hidden_when_none(self) -> None:
        rendered = MetaFormatter(make_meta(permission_mode=None)).summary()
        assert "Permission mode" not in rendered
# ---------------------------------------------------------------------------
# AuditFormatter
# ---------------------------------------------------------------------------
class TestAuditFormatter:
    """Exercise AuditFormatter.summary() across audit fields."""

    def test_basic(self) -> None:
        audit = SessionAudit(
            session_id="abcd1234-5678",
            outcome="success",
            satisfaction="positive",
            session_type="debugging",
        )
        rendered = AuditFormatter(audit).summary()
        assert "abcd1234" in rendered
        assert "Outcome: success" in rendered
        assert "Satisfaction: positive" in rendered
        assert "Type: debugging" in rendered

    def test_with_goals(self) -> None:
        audit = SessionAudit(session_id="x", goal_categories={"bugfix": 5, "refactor": 3})
        rendered = AuditFormatter(audit).summary()
        assert "Goals:" in rendered
        assert "bugfix(5)" in rendered

    def test_without_goals(self) -> None:
        rendered = AuditFormatter(SessionAudit(session_id="x", goal_categories={})).summary()
        assert "Goals" not in rendered

    def test_with_friction(self) -> None:
        audit = SessionAudit(session_id="x", friction_counts={"permission_denied": 4})
        assert "permission_denied(4)" in AuditFormatter(audit).summary()

    def test_without_friction(self) -> None:
        rendered = AuditFormatter(SessionAudit(session_id="x")).summary()
        assert "Friction" not in rendered

    def test_with_instructions(self) -> None:
        audit = SessionAudit(session_id="x", user_instructions=("use pytest", "no comments"))
        assert "Instructions: 2 extracted" in AuditFormatter(audit).summary()

    def test_without_instructions(self) -> None:
        rendered = AuditFormatter(SessionAudit(session_id="x")).summary()
        assert "Instructions" not in rendered

    def test_summary_truncated_at_120(self) -> None:
        audit = SessionAudit(session_id="x", summary="x" * 200)
        rendered = AuditFormatter(audit).summary()
        # Locate the summary line and check its payload was cut to 120 chars.
        summary_line = next(ln for ln in rendered.split("\n") if "Summary" in ln)
        assert len(summary_line.split("Summary: ")[1]) == 120
# ---------------------------------------------------------------------------
# RecommendationFormatter
# ---------------------------------------------------------------------------
class TestRecommendationFormatter:
    """Exercise RecommendationFormatter.summary() output."""

    def test_basic(self) -> None:
        rec = Recommendation(
            suggestion="do X", evidence="50% failure", frequency=0.5, source_sessions=5
        )
        rendered = RecommendationFormatter(rec).summary()
        assert "do X" in rendered
        assert "50% failure" in rendered
# ---------------------------------------------------------------------------
# ProjectFormatter
# ---------------------------------------------------------------------------
class TestProjectFormatter:
    """Exercise ProjectFormatter.summary() output."""

    def make(self, **kw: Any) -> ProjectStats:
        """Build a ProjectStats with sensible defaults, overridable per test."""
        base: dict[str, Any] = {
            "project_path": "/proj/myapp",
            "project_name": "myapp",
            "session_count": 10,
            "success_rate": 0.9,
            "avg_tool_errors": 2.5,
            "avg_duration_s": 600.0,
            "top_error_categories": (),
            "top_friction": (),
        }
        return ProjectStats(**{**base, **kw})

    def test_basic(self) -> None:
        rendered = ProjectFormatter(self.make()).summary()
        assert "myapp: 10 sessions" in rendered
        assert "90% success" in rendered

    def test_outlier_marker(self) -> None:
        rendered = ProjectFormatter(self.make(is_outlier=True)).summary()
        assert "[!]" in rendered

    def test_error_categories_shown(self) -> None:
        stats = self.make(top_error_categories=(("edit_failed", 8), ("command_failed", 3)))
        assert "Errors: edit_failed(8)" in ProjectFormatter(stats).summary()

    def test_friction_shown(self) -> None:
        stats = self.make(top_friction=(("user_rejected", 4),))
        assert "Friction: user_rejected(4)" in ProjectFormatter(stats).summary()

    def test_no_sub_lines_when_clean(self) -> None:
        # A clean project renders exactly one line — no error/friction sub-lines.
        rendered = ProjectFormatter(self.make()).summary()
        assert len(rendered.strip().split("\n")) == 1
# ---------------------------------------------------------------------------
# DigestFormatter
# ---------------------------------------------------------------------------
class TestDigestFormatter:
    """Exercise DigestFormatter.summary() across optional digest sections."""

    def make(self, **kw: Any) -> SessionDigest:
        """Build a SessionDigest with baseline values, overridable per test."""
        base: dict[str, Any] = {
            "session_count": 10,
            "date_range": (100.0, 500.0),
            "success_rate": 0.8,
        }
        return SessionDigest(**{**base, **kw})

    def test_includes_count(self) -> None:
        rendered = DigestFormatter(self.make(session_count=42)).summary()
        assert "42 sessions" in rendered

    def test_success_rate(self) -> None:
        rendered = DigestFormatter(self.make(success_rate=0.8)).summary()
        assert "80% success rate" in rendered

    def test_outcome_distribution(self) -> None:
        digest = self.make(
            session_count=10,
            outcome_distribution={"fully_achieved": 7, "unclear": 3},
        )
        rendered = DigestFormatter(digest).summary()
        assert "fully_achieved: 7 (70%)" in rendered

    def test_no_trends_without_weeks(self) -> None:
        rendered = DigestFormatter(self.make()).summary()
        assert "Trends" not in rendered

    def test_trends_with_weeks(self) -> None:
        week = WeekStats(
            week="2026-W17",
            session_count=5,
            success_rate=0.7,
            avg_errors_per_session=1.0,
            avg_duration_s=600.0,
        )
        rendered = DigestFormatter(self.make(weeks=(week,), rolling_success_rate=0.7)).summary()
        assert "Trends" in rendered
        assert "2026-W17" in rendered

    def test_no_projects_without_data(self) -> None:
        rendered = DigestFormatter(self.make()).summary()
        assert "Projects" not in rendered

    def test_no_recommendations_without_data(self) -> None:
        rendered = DigestFormatter(self.make()).summary()
        assert "Recommendations" not in rendered

    def test_with_recommendations(self) -> None:
        rec = Recommendation(
            suggestion="Fix the thing", evidence="50% failure", frequency=0.5, source_sessions=10
        )
        rendered = DigestFormatter(self.make(recommendations=(rec,))).summary()
        assert "Recommendations" in rendered
        # Recommendations are numbered starting at 1.
        assert "1. Fix the thing" in rendered

    def test_satisfaction_distribution(self) -> None:
        digest = self.make(
            session_count=10,
            satisfaction_distribution={"happy": 6, "neutral": 4},
        )
        rendered = DigestFormatter(digest).summary()
        assert "Satisfaction:" in rendered
        assert "happy: 6" in rendered

    def test_top_friction(self) -> None:
        digest = self.make(top_friction=(("tool_failed", 12), ("blocked", 3)))
        rendered = DigestFormatter(digest).summary()
        assert "Top friction:" in rendered
        assert "tool_failed: 12" in rendered

    def test_sparkline_with_two_weeks(self) -> None:
        earlier = WeekStats(
            week="2026-W16",
            session_count=3,
            success_rate=0.5,
            avg_errors_per_session=2.0,
            avg_duration_s=600.0,
        )
        later = WeekStats(
            week="2026-W17",
            session_count=4,
            success_rate=0.9,
            avg_errors_per_session=0.5,
            avg_duration_s=400.0,
        )
        digest = self.make(weeks=(earlier, later), rolling_success_rate=0.7)
        rendered = DigestFormatter(digest).summary()
        # Two or more weeks produce sparkline brackets for success and errors.
        assert "Success: [" in rendered
        assert "Errors: [" in rendered

    def test_error_category_deltas(self) -> None:
        week = WeekStats(
            week="2026-W17",
            session_count=3,
            success_rate=0.7,
            avg_errors_per_session=1.0,
            avg_duration_s=600.0,
        )
        digest = self.make(
            weeks=(week,),
            error_category_deltas=(("command_failed", 0.5, 4.0, 6.0),),
        )
        rendered = DigestFormatter(digest).summary()
        assert "Error category trends:" in rendered
        assert "command_failed" in rendered

    def test_with_projects(self) -> None:
        stats = ProjectStats(
            project_path="/proj/myapp",
            project_name="myapp",
            session_count=5,
            success_rate=0.8,
            avg_tool_errors=1.0,
            avg_duration_s=300.0,
            top_error_categories=(),
            top_friction=(),
        )
        rendered = DigestFormatter(self.make(projects=(stats,))).summary()
        assert "Projects (1)" in rendered
        assert "myapp" in rendered
class TestDigestToJson:
    """Exercise DigestFormatter.to_json() serialization, including nesting."""

    def make(self, **kw: Any) -> SessionDigest:
        """Build a SessionDigest with baseline values, overridable per test."""
        base: dict[str, Any] = {
            "session_count": 10,
            "date_range": (100.0, 500.0),
            "success_rate": 0.8,
        }
        return SessionDigest(**{**base, **kw})

    def test_valid_json(self) -> None:
        payload = DigestFormatter(self.make(session_count=5)).to_json()
        # Output must round-trip through the json module.
        decoded = json.loads(payload)
        assert decoded["session_count"] == 5

    def test_with_nested_weeks(self) -> None:
        week = WeekStats(
            week="2026-W17",
            session_count=3,
            success_rate=0.7,
            avg_errors_per_session=1.0,
            avg_duration_s=600.0,
        )
        payload = DigestFormatter(self.make(weeks=(week,))).to_json()
        decoded = json.loads(payload)
        assert len(decoded["weeks"]) == 1
        assert decoded["weeks"][0]["week"] == "2026-W17"

    def test_with_nested_projects(self) -> None:
        stats = ProjectStats(
            project_path="/p",
            project_name="p",
            session_count=5,
            success_rate=0.8,
            avg_tool_errors=1.0,
            avg_duration_s=300.0,
            top_error_categories=(),
            top_friction=(),
            is_outlier=False,
        )
        payload = DigestFormatter(self.make(projects=(stats,))).to_json()
        decoded = json.loads(payload)
        assert len(decoded["projects"]) == 1
        assert decoded["projects"][0]["project_name"] == "p"