mirror of
https://github.com/codeflash-ai/codeflash-agent.git
synced 2026-05-04 18:25:19 +00:00
* feat(blackbox): add package with models, CLI, and HTMX dashboard * test(blackbox): add comprehensive test coverage for dashboard * feat(blackbox): cache session scanning via watcher invalidation * docs(blackbox): add README and use fastapi[standard] for dev server * refactor(blackbox): extract presentation logic into formatter classes * refactor(blackbox): extract classify_error helpers * feat(blackbox): wire analytics into session detail view Show token usage, tool breakdowns, and session stats in a collapsible panel when viewing a session. * feat(blackbox): add codeflash plugin detection Detect codeflash agent names, skills, and commands in transcripts. Surface language, optimization domain, and capability badges in the analytics panel. * refactor(blackbox): remove underscore prefixes from internal functions * chore: add ty python-version to root pyproject.toml * chore(blackbox): fix lint errors in test files * style(blackbox): apply ruff formatting to analytics * feat(blackbox): add Playwright E2E tests for dashboard Refactor app.py to expose create_app() factory accepting a projects_dir override, enabling tests to run against fixture data instead of the real ~/.claude/projects/ directory. Routes now read projects_dir from app.state instead of the module-level constant. Add 26 Playwright tests across 5 files covering dashboard loading, session list, session detail with filters and analytics, sidebar collapse/localStorage persistence, and SSE log streaming. All tests pass on chromium, firefox, and webkit (78 total). CI gets a new e2e-blackbox job with a browser matrix strategy running all three engines in parallel, conditional on blackbox path changes, with trace upload on failure. * fix(ci): sync only blackbox package in e2e job * fix(ci): exclude e2e tests from unit test job The test job doesn't install Playwright browsers, so e2e tests error when pytest collects them. 
Ignore tests/e2e/ directories in the test job — those are handled by the dedicated e2e-blackbox job.
333 lines
13 KiB
Python
333 lines
13 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import Any
|
|
|
|
from blackbox.formatting import (
|
|
AuditFormatter,
|
|
DigestFormatter,
|
|
MetaFormatter,
|
|
ProjectFormatter,
|
|
RecommendationFormatter,
|
|
)
|
|
from blackbox.models import (
|
|
ProjectStats,
|
|
Recommendation,
|
|
SessionAudit,
|
|
SessionDigest,
|
|
WeekStats,
|
|
)
|
|
from tests.conftest import make_meta
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# MetaFormatter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestMetaFormatter:
    """MetaFormatter.summary() rendering of session metadata."""

    def summarize(self, **overrides: Any) -> str:
        """Render a summary from a meta fixture built with the given overrides."""
        return MetaFormatter(make_meta(**overrides)).summary()

    def test_basic(self) -> None:
        rendered = self.summarize(input_tokens=5000, output_tokens=2000, tool_errors=2)
        expected_fragments = (
            "abcd1234",
            "60min",
            "10 user / 12 assistant",
            "25 calls (2 errors)",
            "5,000 in / 2,000 out",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_with_git(self) -> None:
        assert "5 commits on main" in self.summarize(git_commits=5, git_branch="main")

    def test_git_without_branch(self) -> None:
        # A commit count without a branch name falls back to "unknown".
        assert "unknown" in self.summarize(git_commits=1, git_branch=None)

    def test_with_files(self) -> None:
        rendered = self.summarize(files_modified=3, lines_added=100, lines_removed=20)
        assert "3 modified" in rendered
        assert "+100/-20" in rendered

    def test_without_files(self) -> None:
        assert "modified" not in self.summarize(files_modified=0)

    def test_with_compactions(self) -> None:
        assert "Compactions: 3" in self.summarize(compactions=3)

    def test_without_compactions(self) -> None:
        assert "Compactions" not in self.summarize(compactions=0)

    def test_with_interruptions(self) -> None:
        assert "Interruptions: 2" in self.summarize(user_interruptions=2)

    def test_without_interruptions(self) -> None:
        assert "Interruptions" not in self.summarize(user_interruptions=0)

    def test_top_tools_capped_at_5(self) -> None:
        # Six tools supplied; only the five most-used may appear.
        counts = {"Read": 20, "Edit": 15, "Bash": 10, "Write": 5, "Grep": 3, "X": 1}
        rendered = self.summarize(tool_counts=counts)
        assert "Read=20" in rendered
        assert "X=1" not in rendered

    def test_no_top_tools_when_empty(self) -> None:
        assert "Top tools" not in self.summarize(tool_counts={})

    def test_thinking_blocks_shown_when_nonzero(self) -> None:
        assert "Thinking blocks: 5" in self.summarize(thinking_blocks=5)

    def test_thinking_blocks_hidden_when_zero(self) -> None:
        assert "Thinking blocks" not in self.summarize(thinking_blocks=0)

    def test_web_shown_when_nonzero(self) -> None:
        rendered = self.summarize(web_searches=3, web_fetches=1)
        assert "Web: 3 searches / 1 fetches" in rendered

    def test_web_hidden_when_zero(self) -> None:
        assert "Web:" not in self.summarize(web_searches=0, web_fetches=0)

    def test_permission_mode_shown_when_set(self) -> None:
        rendered = self.summarize(permission_mode="bypassPermissions")
        assert "Permission mode: bypassPermissions" in rendered

    def test_permission_mode_hidden_when_none(self) -> None:
        assert "Permission mode" not in self.summarize(permission_mode=None)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# AuditFormatter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestAuditFormatter:
    """AuditFormatter.summary() rendering of a single SessionAudit."""

    def test_basic(self) -> None:
        audit = SessionAudit(
            session_id="abcd1234-5678",
            outcome="success",
            satisfaction="positive",
            session_type="debugging",
        )
        rendered = AuditFormatter(audit).summary()
        assert "abcd1234" in rendered
        assert "Outcome: success" in rendered
        assert "Satisfaction: positive" in rendered
        assert "Type: debugging" in rendered

    def test_with_goals(self) -> None:
        audit = SessionAudit(session_id="x", goal_categories={"bugfix": 5, "refactor": 3})
        rendered = AuditFormatter(audit).summary()
        assert "Goals:" in rendered
        assert "bugfix(5)" in rendered

    def test_without_goals(self) -> None:
        rendered = AuditFormatter(SessionAudit(session_id="x", goal_categories={})).summary()
        assert "Goals" not in rendered

    def test_with_friction(self) -> None:
        audit = SessionAudit(session_id="x", friction_counts={"permission_denied": 4})
        assert "permission_denied(4)" in AuditFormatter(audit).summary()

    def test_without_friction(self) -> None:
        rendered = AuditFormatter(SessionAudit(session_id="x")).summary()
        assert "Friction" not in rendered

    def test_with_instructions(self) -> None:
        audit = SessionAudit(session_id="x", user_instructions=("use pytest", "no comments"))
        assert "Instructions: 2 extracted" in AuditFormatter(audit).summary()

    def test_without_instructions(self) -> None:
        rendered = AuditFormatter(SessionAudit(session_id="x")).summary()
        assert "Instructions" not in rendered

    def test_summary_truncated_at_120(self) -> None:
        # A 200-char summary must be clipped to exactly 120 characters.
        audit = SessionAudit(session_id="x", summary="x" * 200)
        rendered = AuditFormatter(audit).summary()
        summary_line = next(ln for ln in rendered.split("\n") if "Summary" in ln)
        assert len(summary_line.split("Summary: ")[1]) == 120
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# RecommendationFormatter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestRecommendationFormatter:
    """RecommendationFormatter.summary() includes suggestion and evidence."""

    def test_basic(self) -> None:
        rec = Recommendation(
            suggestion="do X",
            evidence="50% failure",
            frequency=0.5,
            source_sessions=5,
        )
        rendered = RecommendationFormatter(rec).summary()
        assert "do X" in rendered
        assert "50% failure" in rendered
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# ProjectFormatter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestProjectFormatter:
    """ProjectFormatter.summary() rendering of per-project statistics."""

    def make(self, **kw: Any) -> ProjectStats:
        """Build a ProjectStats with baseline fields, overridden by *kw*."""
        base: dict[str, Any] = {
            "project_path": "/proj/myapp",
            "project_name": "myapp",
            "session_count": 10,
            "success_rate": 0.9,
            "avg_tool_errors": 2.5,
            "avg_duration_s": 600.0,
            "top_error_categories": (),
            "top_friction": (),
        }
        return ProjectStats(**{**base, **kw})

    def test_basic(self) -> None:
        rendered = ProjectFormatter(self.make()).summary()
        assert "myapp: 10 sessions" in rendered
        assert "90% success" in rendered

    def test_outlier_marker(self) -> None:
        # Outlier projects are flagged with a [!] marker.
        assert "[!]" in ProjectFormatter(self.make(is_outlier=True)).summary()

    def test_error_categories_shown(self) -> None:
        stats = self.make(top_error_categories=(("edit_failed", 8), ("command_failed", 3)))
        assert "Errors: edit_failed(8)" in ProjectFormatter(stats).summary()

    def test_friction_shown(self) -> None:
        stats = self.make(top_friction=(("user_rejected", 4),))
        assert "Friction: user_rejected(4)" in ProjectFormatter(stats).summary()

    def test_no_sub_lines_when_clean(self) -> None:
        # With no errors or friction the summary collapses to a single line.
        rendered = ProjectFormatter(self.make()).summary()
        assert len(rendered.strip().split("\n")) == 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# DigestFormatter
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestDigestFormatter:
    """DigestFormatter.summary() rendering of an aggregate session digest."""

    def make(self, **kw: Any) -> SessionDigest:
        """Build a SessionDigest with baseline fields, overridden by *kw*."""
        base: dict[str, Any] = {
            "session_count": 10,
            "date_range": (100.0, 500.0),
            "success_rate": 0.8,
        }
        return SessionDigest(**{**base, **kw})

    def test_includes_count(self) -> None:
        assert "42 sessions" in DigestFormatter(self.make(session_count=42)).summary()

    def test_success_rate(self) -> None:
        assert "80% success rate" in DigestFormatter(self.make(success_rate=0.8)).summary()

    def test_outcome_distribution(self) -> None:
        digest = self.make(
            session_count=10,
            outcome_distribution={"fully_achieved": 7, "unclear": 3},
        )
        # Counts are rendered with their percentage of the session total.
        assert "fully_achieved: 7 (70%)" in DigestFormatter(digest).summary()

    def test_no_trends_without_weeks(self) -> None:
        assert "Trends" not in DigestFormatter(self.make()).summary()

    def test_trends_with_weeks(self) -> None:
        week = WeekStats(
            week="2026-W17",
            session_count=5,
            success_rate=0.7,
            avg_errors_per_session=1.0,
            avg_duration_s=600.0,
        )
        rendered = DigestFormatter(self.make(weeks=(week,), rolling_success_rate=0.7)).summary()
        assert "Trends" in rendered
        assert "2026-W17" in rendered

    def test_no_projects_without_data(self) -> None:
        assert "Projects" not in DigestFormatter(self.make()).summary()

    def test_no_recommendations_without_data(self) -> None:
        assert "Recommendations" not in DigestFormatter(self.make()).summary()

    def test_with_recommendations(self) -> None:
        rec = Recommendation(
            suggestion="Fix the thing",
            evidence="50% failure",
            frequency=0.5,
            source_sessions=10,
        )
        rendered = DigestFormatter(self.make(recommendations=(rec,))).summary()
        assert "Recommendations" in rendered
        assert "1. Fix the thing" in rendered

    def test_satisfaction_distribution(self) -> None:
        digest = self.make(
            session_count=10,
            satisfaction_distribution={"happy": 6, "neutral": 4},
        )
        rendered = DigestFormatter(digest).summary()
        assert "Satisfaction:" in rendered
        assert "happy: 6" in rendered

    def test_top_friction(self) -> None:
        digest = self.make(top_friction=(("tool_failed", 12), ("blocked", 3)))
        rendered = DigestFormatter(digest).summary()
        assert "Top friction:" in rendered
        assert "tool_failed: 12" in rendered

    def test_sparkline_with_two_weeks(self) -> None:
        earlier = WeekStats(
            week="2026-W16",
            session_count=3,
            success_rate=0.5,
            avg_errors_per_session=2.0,
            avg_duration_s=600.0,
        )
        later = WeekStats(
            week="2026-W17",
            session_count=4,
            success_rate=0.9,
            avg_errors_per_session=0.5,
            avg_duration_s=400.0,
        )
        rendered = DigestFormatter(
            self.make(weeks=(earlier, later), rolling_success_rate=0.7)
        ).summary()
        assert "Success: [" in rendered
        assert "Errors: [" in rendered

    def test_error_category_deltas(self) -> None:
        week = WeekStats(
            week="2026-W17",
            session_count=3,
            success_rate=0.7,
            avg_errors_per_session=1.0,
            avg_duration_s=600.0,
        )
        digest = self.make(
            weeks=(week,),
            error_category_deltas=(("command_failed", 0.5, 4.0, 6.0),),
        )
        rendered = DigestFormatter(digest).summary()
        assert "Error category trends:" in rendered
        assert "command_failed" in rendered

    def test_with_projects(self) -> None:
        stats = ProjectStats(
            project_path="/proj/myapp",
            project_name="myapp",
            session_count=5,
            success_rate=0.8,
            avg_tool_errors=1.0,
            avg_duration_s=300.0,
            top_error_categories=(),
            top_friction=(),
        )
        rendered = DigestFormatter(self.make(projects=(stats,))).summary()
        assert "Projects (1)" in rendered
        assert "myapp" in rendered
|
|
|
|
|
|
class TestDigestToJson:
    """DigestFormatter.to_json() produces valid, fully nested JSON."""

    def make(self, **kw: Any) -> SessionDigest:
        """Build a SessionDigest with baseline fields, overridden by *kw*."""
        base: dict[str, Any] = {
            "session_count": 10,
            "date_range": (100.0, 500.0),
            "success_rate": 0.8,
        }
        return SessionDigest(**{**base, **kw})

    def test_valid_json(self) -> None:
        payload = json.loads(DigestFormatter(self.make(session_count=5)).to_json())
        assert payload["session_count"] == 5

    def test_with_nested_weeks(self) -> None:
        week = WeekStats(
            week="2026-W17",
            session_count=3,
            success_rate=0.7,
            avg_errors_per_session=1.0,
            avg_duration_s=600.0,
        )
        payload = json.loads(DigestFormatter(self.make(weeks=(week,))).to_json())
        assert len(payload["weeks"]) == 1
        assert payload["weeks"][0]["week"] == "2026-W17"

    def test_with_nested_projects(self) -> None:
        stats = ProjectStats(
            project_path="/p",
            project_name="p",
            session_count=5,
            success_rate=0.8,
            avg_tool_errors=1.0,
            avg_duration_s=300.0,
            top_error_categories=(),
            top_friction=(),
            is_outlier=False,
        )
        payload = json.loads(DigestFormatter(self.make(projects=(stats,))).to_json())
        assert len(payload["projects"]) == 1
        assert payload["projects"][0]["project_name"] == "p"
|