From d02f4a156454b28948305bb6e0f56d18ec330db9 Mon Sep 17 00:00:00 2001 From: Kevin Turcios Date: Sat, 14 Feb 2026 22:25:30 -0500 Subject: [PATCH] test: add evals for all three Tessl tiles --- .../evals/capabilities.json | 104 ++++++++++++++++++ .../evals/scenario-1/capability.txt | 1 + .../evals/scenario-1/criteria.json | 26 +++++ .../evals/scenario-1/task.md | 31 ++++++ .../evals/scenario-2/capability.txt | 1 + .../evals/scenario-2/criteria.json | 31 ++++++ .../evals/scenario-2/task.md | 34 ++++++ .../evals/scenario-3/capability.txt | 1 + .../evals/scenario-3/criteria.json | 26 +++++ .../evals/scenario-3/task.md | 38 +++++++ .../evals/scenario-4/capability.txt | 1 + .../evals/scenario-4/criteria.json | 26 +++++ .../evals/scenario-4/task.md | 31 ++++++ .../evals/scenario-5/capability.txt | 1 + .../evals/scenario-5/criteria.json | 26 +++++ .../evals/scenario-5/task.md | 39 +++++++ .../evals/summary.json | 40 +++++++ .../evals/summary_infeasible.json | 14 +++ .../evals/capabilities.json | 97 ++++++++++++++++ .../evals/scenario-1/capability.txt | 1 + .../evals/scenario-1/criteria.json | 31 ++++++ .../evals/scenario-1/task.md | 38 +++++++ .../evals/scenario-2/capability.txt | 1 + .../evals/scenario-2/criteria.json | 26 +++++ .../evals/scenario-2/task.md | 35 ++++++ .../evals/scenario-3/capability.txt | 1 + .../evals/scenario-3/criteria.json | 26 +++++ .../evals/scenario-3/task.md | 33 ++++++ .../evals/scenario-4/capability.txt | 1 + .../evals/scenario-4/criteria.json | 31 ++++++ .../evals/scenario-4/task.md | 82 ++++++++++++++ .../evals/scenario-5/capability.txt | 1 + .../evals/scenario-5/criteria.json | 26 +++++ .../evals/scenario-5/task.md | 43 ++++++++ .../evals/summary.json | 70 ++++++++++++ .../evals/summary_infeasible.json | 5 + .../evals/capabilities.json | 97 ++++++++++++++++ .../evals/scenario-1/capability.txt | 1 + .../evals/scenario-1/criteria.json | 26 +++++ .../evals/scenario-1/task.md | 38 +++++++ .../evals/scenario-2/capability.txt | 1 + 
.../evals/scenario-2/criteria.json | 26 +++++ .../evals/scenario-2/task.md | 44 ++++++++ .../evals/scenario-3/capability.txt | 1 + .../evals/scenario-3/criteria.json | 31 ++++++ .../evals/scenario-3/task.md | 30 +++++ .../evals/scenario-4/capability.txt | 1 + .../evals/scenario-4/criteria.json | 31 ++++++ .../evals/scenario-4/task.md | 38 +++++++ .../evals/scenario-5/capability.txt | 1 + .../evals/scenario-5/criteria.json | 26 +++++ .../evals/scenario-5/task.md | 52 +++++++++ .../evals/summary.json | 47 ++++++++ .../evals/summary_infeasible.json | 21 ++++ 54 files changed, 1531 insertions(+) create mode 100644 tiles/codeflash-internal-docs/evals/capabilities.json create mode 100644 tiles/codeflash-internal-docs/evals/scenario-1/capability.txt create mode 100644 tiles/codeflash-internal-docs/evals/scenario-1/criteria.json create mode 100644 tiles/codeflash-internal-docs/evals/scenario-1/task.md create mode 100644 tiles/codeflash-internal-docs/evals/scenario-2/capability.txt create mode 100644 tiles/codeflash-internal-docs/evals/scenario-2/criteria.json create mode 100644 tiles/codeflash-internal-docs/evals/scenario-2/task.md create mode 100644 tiles/codeflash-internal-docs/evals/scenario-3/capability.txt create mode 100644 tiles/codeflash-internal-docs/evals/scenario-3/criteria.json create mode 100644 tiles/codeflash-internal-docs/evals/scenario-3/task.md create mode 100644 tiles/codeflash-internal-docs/evals/scenario-4/capability.txt create mode 100644 tiles/codeflash-internal-docs/evals/scenario-4/criteria.json create mode 100644 tiles/codeflash-internal-docs/evals/scenario-4/task.md create mode 100644 tiles/codeflash-internal-docs/evals/scenario-5/capability.txt create mode 100644 tiles/codeflash-internal-docs/evals/scenario-5/criteria.json create mode 100644 tiles/codeflash-internal-docs/evals/scenario-5/task.md create mode 100644 tiles/codeflash-internal-docs/evals/summary.json create mode 100644 tiles/codeflash-internal-docs/evals/summary_infeasible.json 
create mode 100644 tiles/codeflash-internal-rules/evals/capabilities.json create mode 100644 tiles/codeflash-internal-rules/evals/scenario-1/capability.txt create mode 100644 tiles/codeflash-internal-rules/evals/scenario-1/criteria.json create mode 100644 tiles/codeflash-internal-rules/evals/scenario-1/task.md create mode 100644 tiles/codeflash-internal-rules/evals/scenario-2/capability.txt create mode 100644 tiles/codeflash-internal-rules/evals/scenario-2/criteria.json create mode 100644 tiles/codeflash-internal-rules/evals/scenario-2/task.md create mode 100644 tiles/codeflash-internal-rules/evals/scenario-3/capability.txt create mode 100644 tiles/codeflash-internal-rules/evals/scenario-3/criteria.json create mode 100644 tiles/codeflash-internal-rules/evals/scenario-3/task.md create mode 100644 tiles/codeflash-internal-rules/evals/scenario-4/capability.txt create mode 100644 tiles/codeflash-internal-rules/evals/scenario-4/criteria.json create mode 100644 tiles/codeflash-internal-rules/evals/scenario-4/task.md create mode 100644 tiles/codeflash-internal-rules/evals/scenario-5/capability.txt create mode 100644 tiles/codeflash-internal-rules/evals/scenario-5/criteria.json create mode 100644 tiles/codeflash-internal-rules/evals/scenario-5/task.md create mode 100644 tiles/codeflash-internal-rules/evals/summary.json create mode 100644 tiles/codeflash-internal-rules/evals/summary_infeasible.json create mode 100644 tiles/codeflash-internal-skills/evals/capabilities.json create mode 100644 tiles/codeflash-internal-skills/evals/scenario-1/capability.txt create mode 100644 tiles/codeflash-internal-skills/evals/scenario-1/criteria.json create mode 100644 tiles/codeflash-internal-skills/evals/scenario-1/task.md create mode 100644 tiles/codeflash-internal-skills/evals/scenario-2/capability.txt create mode 100644 tiles/codeflash-internal-skills/evals/scenario-2/criteria.json create mode 100644 tiles/codeflash-internal-skills/evals/scenario-2/task.md create mode 100644 
tiles/codeflash-internal-skills/evals/scenario-3/capability.txt create mode 100644 tiles/codeflash-internal-skills/evals/scenario-3/criteria.json create mode 100644 tiles/codeflash-internal-skills/evals/scenario-3/task.md create mode 100644 tiles/codeflash-internal-skills/evals/scenario-4/capability.txt create mode 100644 tiles/codeflash-internal-skills/evals/scenario-4/criteria.json create mode 100644 tiles/codeflash-internal-skills/evals/scenario-4/task.md create mode 100644 tiles/codeflash-internal-skills/evals/scenario-5/capability.txt create mode 100644 tiles/codeflash-internal-skills/evals/scenario-5/criteria.json create mode 100644 tiles/codeflash-internal-skills/evals/scenario-5/task.md create mode 100644 tiles/codeflash-internal-skills/evals/summary.json create mode 100644 tiles/codeflash-internal-skills/evals/summary_infeasible.json diff --git a/tiles/codeflash-internal-docs/evals/capabilities.json b/tiles/codeflash-internal-docs/evals/capabilities.json new file mode 100644 index 000000000..1d84df120 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/capabilities.json @@ -0,0 +1,104 @@ +{ + "package_name": "codeflash-internal-docs", + "total_capabilities": 14, + "capabilities": [ + { + "id": 0, + "name": "optimize-schema-structure", + "description": "Understands OptimizeSchema and OptimizeSchemaLP request schemas, their fields, defaults, and the difference between them (e.g., line_profiler_results, different n_candidates defaults).", + "complexity": "basic", + "api_elements": ["OptimizeSchema", "OptimizeSchemaLP", "n_candidates", "line_profiler_results", "source_code", "trace_id"] + }, + { + "id": 1, + "name": "domain-models-relationships", + "description": "Understands OptimizationFeatures, OptimizationEvents, and Repositories Django models, their fields and purposes (trace_id PK, approval workflow, PR events, usage tracking).", + "complexity": "basic", + "api_elements": ["OptimizationFeatures", "OptimizationEvents", "Repositories", "trace_id", 
"approval_status", "llm_cost", "optimizations_limit"] + }, + { + "id": 2, + "name": "optimization-pipeline-flow", + "description": "Understands the 6-step optimization pipeline: request entry, language dispatch, context extraction, LLM calls, postprocessing, response assembly.", + "complexity": "intermediate", + "api_elements": ["optimize_api", "optimize_python", "optimize_python_code", "optimize_python_code_single", "asyncio.TaskGroup"] + }, + { + "id": 3, + "name": "language-dispatch-routing", + "description": "Understands how both optimize and testgen routers dispatch by data.language to JavaScript/TypeScript, Java, or Python handlers, and that imports are lazy.", + "complexity": "basic", + "api_elements": ["optimize_api", "testgen_api", "data.language", "optimize_javascript", "optimize_java", "optimize_python"] + }, + { + "id": 4, + "name": "context-extraction-types", + "description": "Understands SingleOptimizerContext vs MultiOptimizerContext, the factory method get_dynamic_context(), prompt construction methods, and markdown code block format with file path annotations.", + "complexity": "intermediate", + "api_elements": ["SingleOptimizerContext", "MultiOptimizerContext", "BaseOptimizerContext", "get_dynamic_context", "get_system_prompt", "get_user_prompt", "extract_code_and_explanation_from_llm_res"] + }, + { + "id": 5, + "name": "llm-response-parsing", + "description": "Understands how LLM responses are parsed: extract_code_and_explanation_from_llm_res for markdown code blocks, parse_and_generate_candidate_schema for schema conversion, and is_valid_code for syntax validation.", + "complexity": "intermediate", + "api_elements": ["extract_code_and_explanation_from_llm_res", "parse_and_generate_candidate_schema", "is_valid_code", "OptimizeResponseItemSchema"] + }, + { + "id": 6, + "name": "model-distribution-formula", + "description": "Knows the model distribution formula: claude_calls = (total - 1) // 2, gpt_calls = total - claude_calls. 
Knows MAX_OPTIMIZER_CALLS=6, MAX_OPTIMIZER_LP_CALLS=7, and concrete example outputs.", + "complexity": "intermediate", + "api_elements": ["get_model_distribution", "MAX_OPTIMIZER_CALLS", "MAX_OPTIMIZER_LP_CALLS", "claude_calls", "gpt_calls"] + }, + { + "id": 7, + "name": "postprocessing-dedup", + "description": "Understands AST-based deduplication using ast.parse() + ast.dump(), equality_check() for filtering identical candidates, and libcst for code transformations.", + "complexity": "intermediate", + "api_elements": ["deduplicate_optimizations", "equality_check", "ast.parse", "ast.dump", "libcst"] + }, + { + "id": 8, + "name": "aiservice-endpoint-map", + "description": "Knows all 12 Django-Ninja endpoints, their paths, API names, and modules. Understands common patterns: async def, AuthenticatedRequest, response schemas with status codes.", + "complexity": "basic", + "api_elements": ["/ai/optimize", "/ai/testgen", "/ai/refinement", "/ai/code_repair", "/ai/adaptive_optimize", "/ai/rewrite_jit", "AuthenticatedRequest"] + }, + { + "id": 9, + "name": "cf-api-route-ordering", + "description": "Understands that webhook routes must be registered before the body parser for raw body signature verification, the 4-phase registration order, and the middleware stack (checkForValidAPIKey, trackEndpointCalls, idLimiter, trackUsage).", + "complexity": "intermediate", + "api_elements": ["webhook.routes.ts", "express.json", "checkForValidAPIKey", "trackEndpointCalls", "idLimiter", "trackUsage"] + }, + { + "id": 10, + "name": "llm-dataclass-and-call", + "description": "Understands the LLM pydantic dataclass (name, max_tokens, model_type, cost fields), call_llm() function signature and its parameters, and the LLMResponse/LLMUsage types.", + "complexity": "intermediate", + "api_elements": ["LLM", "call_llm", "LLMResponse", "LLMUsage", "model_type", "input_cost", "output_cost"] + }, + { + "id": 11, + "name": "llm-client-setup", + "description": "Understands how LLM clients are 
created: AsyncAzureOpenAI for OpenAI (AZURE_OPENAI_* env vars), AsyncAnthropicFoundry for Anthropic (ANTHROPIC_FOUNDRY_* env vars), and that fresh clients are created per request.", + "complexity": "advanced", + "api_elements": ["_create_openai_client", "_create_anthropic_client", "get_llm_client", "AsyncAzureOpenAI", "AsyncAnthropicFoundry"] + }, + { + "id": 12, + "name": "llm-cost-calculation", + "description": "Understands calculate_llm_cost() and the difference between OpenAI and Anthropic cached token accounting: Anthropic's cache tokens are additive to input_tokens, OpenAI's cached_tokens is a subset of prompt_tokens.", + "complexity": "advanced", + "api_elements": ["calculate_llm_cost", "cache_read_input_tokens", "cache_creation_input_tokens", "cached_tokens", "prompt_tokens"] + }, + { + "id": 13, + "name": "testgen-instrumentation", + "description": "Understands the test generation pipeline: build_prompt (Jinja2), instrument_tests (behavior + performance), framework detection for GPU sync, device sync precompute, and LLMOutputParseError.", + "complexity": "advanced", + "api_elements": ["build_prompt", "instrument_tests", "detect_frameworks_from_code", "_create_device_sync_precompute_statements", "LLMOutputParseError"] + } + ] +} diff --git a/tiles/codeflash-internal-docs/evals/scenario-1/capability.txt b/tiles/codeflash-internal-docs/evals/scenario-1/capability.txt new file mode 100644 index 000000000..7f6ae3454 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-1/capability.txt @@ -0,0 +1 @@ +Model distribution formula and LLM cost calculation with provider-specific cached token accounting. 
diff --git a/tiles/codeflash-internal-docs/evals/scenario-1/criteria.json b/tiles/codeflash-internal-docs/evals/scenario-1/criteria.json new file mode 100644 index 000000000..5d9a65181 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-1/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent understands the model distribution formula, LLM cost calculation differences between OpenAI and Anthropic, and the MAX_OPTIMIZER_CALLS constant.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Distribution formula correct", + "description": "get_model_distribution uses claude_calls = (total - 1) // 2 and gpt_calls = total - claude_calls, and total = min(n_candidates, max_calls). For n_candidates=5, max_calls=6: returns 3 OpenAI + 2 Anthropic. For n_candidates=6, max_calls=6: returns 4 OpenAI + 2 Anthropic.", + "max_score": 30 + }, + { + "name": "OpenAI cached token accounting", + "description": "For OpenAI, cached_input_tokens is treated as a subset of input_tokens. Non-cached input = input_tokens - cached_input_tokens. Cost uses both rates correctly with GPT-5-mini pricing ($0.25 input, $0.03 cached, $2.00 output per 1M tokens).", + "max_score": 25 + }, + { + "name": "Anthropic cached token accounting", + "description": "For Anthropic, cached_input_tokens is additive to input_tokens (separate count). 
Cost uses Claude Sonnet 4.5 pricing ($3.00 input, $15.00 output per 1M tokens).", + "max_score": 25 + }, + { + "name": "Full run estimation uses MAX_OPTIMIZER_CALLS=6", + "description": "estimate_full_run_cost uses MAX_OPTIMIZER_CALLS = 6 as the max_calls parameter and correctly multiplies per-call costs by the call count for each provider.", + "max_score": 20 + } + ] +} diff --git a/tiles/codeflash-internal-docs/evals/scenario-1/task.md b/tiles/codeflash-internal-docs/evals/scenario-1/task.md new file mode 100644 index 000000000..59055eba8 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-1/task.md @@ -0,0 +1,31 @@ +# Task: Implement a Model Distribution Calculator + +## Context + +You are working on the codeflash-internal aiservice backend. The optimization pipeline distributes LLM calls across OpenAI and Anthropic models in parallel. The distribution logic lives in `core/shared/optimizer_config.py`. + +You need to write a Python function that replicates the model distribution logic, and a second function that calculates the total estimated LLM cost for an optimization run given usage data. + +## Task + +1. Write a function `get_model_distribution(n_candidates: int, max_calls: int) -> list[tuple[str, int]]` that: + - Takes the number of requested candidates and the maximum allowed parallel calls + - Computes `total = min(n_candidates, max_calls)` + - Splits between OpenAI and Anthropic using the formula: `claude_calls = (total - 1) // 2`, `gpt_calls = total - claude_calls` + - Returns a list of `(model_name, call_count)` tuples, using `"openai"` and `"anthropic"` as model names + +2. Write a function `calculate_optimization_cost(input_tokens: int, output_tokens: int, cached_input_tokens: int, provider: str) -> float` that: + - Computes the cost in USD given token counts + - For the `"openai"` provider: `cached_input_tokens` is a **subset** of `input_tokens`, so non-cached = `input_tokens - cached_input_tokens`. 
Use GPT-5-mini pricing: $0.25 input, $0.03 cached input, $2.00 output per 1M tokens. + - For the `"anthropic"` provider: `cached_input_tokens` is **additive** to `input_tokens` (they are separate). Use Claude Sonnet 4.5 pricing: $3.00 input, $15.00 output per 1M tokens (no cached discount). + +3. Write a function `estimate_full_run_cost(n_candidates: int, avg_input_tokens: int, avg_output_tokens: int, avg_cached_tokens: int) -> float` that: + - Uses `get_model_distribution` with `MAX_OPTIMIZER_CALLS = 6` + - For each provider's call count, calculates cost using `calculate_optimization_cost` + - Returns total estimated cost + +## Expected Outputs + +- A Python module with all three functions +- The distribution for `n_candidates=5, max_calls=6` should produce 3 OpenAI + 2 Anthropic calls +- The distribution for `n_candidates=6, max_calls=6` should produce 4 OpenAI + 2 Anthropic calls diff --git a/tiles/codeflash-internal-docs/evals/scenario-2/capability.txt b/tiles/codeflash-internal-docs/evals/scenario-2/capability.txt new file mode 100644 index 000000000..f71aee810 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-2/capability.txt @@ -0,0 +1 @@ +Language dispatch routing pattern and aiservice endpoint registration conventions. diff --git a/tiles/codeflash-internal-docs/evals/scenario-2/criteria.json b/tiles/codeflash-internal-docs/evals/scenario-2/criteria.json new file mode 100644 index 000000000..75378869b --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-2/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests whether the agent understands the language dispatch pattern, endpoint registration conventions, and Django-Ninja schema patterns from the aiservice endpoints and optimization pipeline docs.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Language dispatch pattern", + "description": "The router dispatches by data.language with explicit checks for 'javascript'/'typescript', 'java', and a default for Python. 
All three branches are present.", + "max_score": 25 + }, + { + "name": "Lazy imports", + "description": "Language-specific handler imports are inside the function body (not at module top level) to avoid circular dependencies, matching the existing optimize_api and testgen_api pattern.", + "max_score": 20 + }, + { + "name": "Async endpoint with auth", + "description": "The endpoint handler is declared as async def and uses AuthenticatedRequest (HttpBearer + django_auth) for authentication. Response schema uses status code mapping: {200: SuccessSchema, 400: ErrorSchema, 500: ErrorSchema}.", + "max_score": 25 + }, + { + "name": "Schema follows conventions", + "description": "CodeReviewSchema inherits from ninja.Schema, includes source_code (str), trace_id (str), language (str, default 'python'), and any additional fields use proper typing.", + "max_score": 15 + }, + { + "name": "URL registration", + "description": "Shows registration in urls.py using api.add_router or urlpatterns with the path '/ai/code_review' and the code_review_api instance, consistent with existing endpoint registrations.", + "max_score": 15 + } + ] +} diff --git a/tiles/codeflash-internal-docs/evals/scenario-2/task.md b/tiles/codeflash-internal-docs/evals/scenario-2/task.md new file mode 100644 index 000000000..abb1a3bbb --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-2/task.md @@ -0,0 +1,34 @@ +# Task: Build a Language-Dispatching Router for a New Endpoint + +## Context + +You are adding a new `code_review` endpoint to the codeflash-internal aiservice backend. The existing optimization and testgen endpoints both follow the same pattern: a Django-Ninja router that dispatches to language-specific handlers based on a `data.language` field. Imports are kept lazy (inside the function body) to avoid circular dependencies. + +The endpoint should be registered at `/ai/code_review` and follow the same conventions as the existing 12 endpoints in `aiservice/urls.py`. + +## Task + +1. 
Write a Django-Ninja router module `code_review_router.py` that: + - Creates a `NinjaAPI` instance named `code_review_api` + - Has a single `POST /` endpoint that is `async def` + - Accepts an `AuthenticatedRequest` and a request body schema + - Dispatches by `data.language`: + - `"javascript"` or `"typescript"` calls a JS/TS handler + - `"java"` calls a Java handler + - Default (Python) calls a Python handler + - Uses lazy imports inside the function body + - Returns typed response schemas: `response={200: SuccessSchema, 400: ErrorSchema, 500: ErrorSchema}` + +2. Write a request schema `CodeReviewSchema` (inheriting from `ninja.Schema`) that includes: + - `source_code: str` + - `trace_id: str` + - `language: str` with a default of `"python"` + - `review_depth: str` with a default of `"standard"` + +3. Show how this endpoint would be registered in `aiservice/urls.py` alongside the existing endpoints. + +## Expected Outputs + +- A `code_review_router.py` module following the codeflash conventions +- A `CodeReviewSchema` class +- A URL registration snippet diff --git a/tiles/codeflash-internal-docs/evals/scenario-3/capability.txt b/tiles/codeflash-internal-docs/evals/scenario-3/capability.txt new file mode 100644 index 000000000..fa4fe883d --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-3/capability.txt @@ -0,0 +1 @@ +LLM response parsing for annotated markdown code blocks and AST-based candidate deduplication. 
diff --git a/tiles/codeflash-internal-docs/evals/scenario-3/criteria.json b/tiles/codeflash-internal-docs/evals/scenario-3/criteria.json new file mode 100644 index 000000000..1525c461c --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-3/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent understands the markdown code block format with file path annotations, AST-based deduplication, equality checking, and code validation from the context extraction and postprocessing docs.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "File-path-annotated code block parsing", + "description": "extract_code_blocks correctly parses the ```python:path/to/file.py format, extracting both the file path and language from the annotation. Handles both annotated and plain code blocks.", + "max_score": 30 + }, + { + "name": "AST-based deduplication", + "description": "deduplicate_candidates uses ast.parse() and ast.dump() to normalize code before comparison, rather than string equality. This ensures whitespace/comment differences are ignored.", + "max_score": 30 + }, + { + "name": "Equality check against original", + "description": "deduplicate_candidates filters out candidates whose AST is identical to the original_code's AST, ensuring only genuinely different optimizations are returned.", + "max_score": 20 + }, + { + "name": "Graceful SyntaxError handling", + "description": "Both deduplicate_candidates and validate_python_code handle SyntaxError from ast.parse() gracefully. 
deduplicate_candidates keeps unparseable candidates rather than discarding them.", + "max_score": 20 + } + ] +} diff --git a/tiles/codeflash-internal-docs/evals/scenario-3/task.md b/tiles/codeflash-internal-docs/evals/scenario-3/task.md new file mode 100644 index 000000000..c10741bbb --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-3/task.md @@ -0,0 +1,38 @@ +# Task: Implement an LLM Response Parser for Optimization Candidates + +## Context + +The codeflash-internal optimization pipeline receives LLM responses as markdown text containing code blocks with file path annotations. The context extraction system in `optimizer_context.py` parses these responses to extract optimized code candidates. The format uses annotated markdown code blocks like: + +```` +```python:path/to/file.py +# optimized code here +``` +```` + +After extraction, candidates go through postprocessing: AST-based deduplication (using `ast.parse()` + `ast.dump()`) and equality checking against the original code. + +## Task + +1. Write a function `extract_code_blocks(llm_response: str) -> list[dict]` that: + - Parses markdown code blocks from an LLM response string + - Handles the file-path-annotated format: `` ```python:path/to/file.py `` + - Returns a list of dicts, each with keys: `"code"` (str), `"file_path"` (str or None), `"language"` (str) + - Handles both annotated (with file path) and plain code blocks + +2. Write a function `deduplicate_candidates(candidates: list[str], original_code: str) -> list[str]` that: + - Removes duplicate candidates using AST-based comparison (`ast.parse()` + `ast.dump()`) + - Filters out candidates that are identical to the original code (equality check) + - Returns only unique, non-original candidates + - Handles `SyntaxError` gracefully (keep candidates that fail to parse, as they might use features beyond basic AST) + +3. 
Write a function `validate_python_code(code: str) -> bool` that: + - Checks if the code is syntactically valid Python + - Returns True if `ast.parse()` succeeds, False otherwise + +## Expected Outputs + +- A Python module with all three functions +- `extract_code_blocks` should correctly parse multi-block responses +- `deduplicate_candidates` should use AST normalization, not string comparison +- The module should import only `ast` and `re` from the standard library diff --git a/tiles/codeflash-internal-docs/evals/scenario-4/capability.txt b/tiles/codeflash-internal-docs/evals/scenario-4/capability.txt new file mode 100644 index 000000000..fe084501d --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-4/capability.txt @@ -0,0 +1 @@ +CF-API route registration ordering and middleware stack with webhook-before-body-parser requirement. diff --git a/tiles/codeflash-internal-docs/evals/scenario-4/criteria.json b/tiles/codeflash-internal-docs/evals/scenario-4/criteria.json new file mode 100644 index 000000000..288734393 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-4/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent understands cf-api route registration ordering, the webhook-before-body-parser requirement, and the middleware stack from the cf-api-endpoints docs.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Four-phase registration order", + "description": "routes/index.ts registers routes in the correct order: (1) webhook routes, (2) body parser, (3) public routes, (4) protected routes. 
Webhook routes are explicitly before express.json().", + "max_score": 30 + }, + { + "name": "Webhook raw body explanation", + "description": "A comment or explanation states that webhook routes need raw body access for signature verification (e.g., GitHub Octokit signature verification, Stripe webhook verification), and the JSON body parser would consume/parse the raw body before the webhook handler can access it.", + "max_score": 20 + }, + { + "name": "Protected route middleware", + "description": "Deployment routes are registered in the protected phase (after body parser, with checkForValidAPIKey middleware). Routes use trackEndpointCalls and trackUsage middleware.", + "max_score": 25 + }, + { + "name": "Deployment routes structure", + "description": "deployment.routes.ts exports a Router with POST /deploy/trigger, POST /deploy/status, and POST /deploy/rollback endpoints.", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-internal-docs/evals/scenario-4/task.md b/tiles/codeflash-internal-docs/evals/scenario-4/task.md new file mode 100644 index 000000000..aa6c688ab --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-4/task.md @@ -0,0 +1,31 @@ +# Task: Write an Express Route Registration Module with Correct Middleware Ordering + +## Context + +The codeflash-internal cf-api is an Express server where route registration order is critical. Webhook routes must be registered **before** the JSON body parser because they need raw body access for signature verification. Protected routes require API key authentication via middleware. + +You are adding a new set of routes for a "deployment" feature and need to integrate them into the existing route registration order. + +## Task + +1. 
Write a TypeScript route registration module `routes/index.ts` that demonstrates the correct ordering: + - Phase 1: Webhook routes (before body parser) -- include GitHub and Stripe webhook handlers + - Phase 2: Body parser with `express.json({ limit: JSON_BODY_LIMIT })` + - Phase 3: Public routes (no auth) + - Phase 4: Protected routes with `checkForValidAPIKey` middleware + +2. Write a new `deployment.routes.ts` module that: + - Has `POST /deploy/trigger` -- triggers a deployment + - Has `POST /deploy/status` -- checks deployment status + - Has `POST /deploy/rollback` -- rolls back a deployment + - All routes are protected (require API key) + - Uses `trackEndpointCalls` middleware for PostHog tracking + - Uses `trackUsage` middleware for usage tracking + +3. Explain in a code comment why the webhook routes must come before `express.json()` and what would break if they didn't. + +## Expected Outputs + +- A `routes/index.ts` showing the 4-phase registration with the new deployment routes in the correct phase +- A `deployment.routes.ts` with the three endpoints and proper middleware +- A comment explaining the webhook ordering requirement diff --git a/tiles/codeflash-internal-docs/evals/scenario-5/capability.txt b/tiles/codeflash-internal-docs/evals/scenario-5/capability.txt new file mode 100644 index 000000000..685bdfe8b --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-5/capability.txt @@ -0,0 +1 @@ +LLM provider abstraction with provider-specific handling, observability patterns, and testgen framework detection. 
diff --git a/tiles/codeflash-internal-docs/evals/scenario-5/criteria.json b/tiles/codeflash-internal-docs/evals/scenario-5/criteria.json new file mode 100644 index 000000000..e565370de --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-5/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent understands the LLM provider abstraction (dataclass, client setup, provider-specific handling), observability patterns, and test generation framework detection from the llm-provider-abstraction and test-generation-pipeline docs.", + "type": "weighted_checklist", + "checklist": [ + { + "name": "LLM dataclass and model instances", + "description": "LLM is a pydantic_dataclass with all 6 fields (name, max_tokens, model_type as Literal, input_cost, cached_input_cost, output_cost). Concrete instances use correct pricing from the docs.", + "max_score": 20 + }, + { + "name": "Provider-specific call handling", + "description": "call_llm dispatches by model_type. OpenAI uses client.chat.completions.create() with max_completion_tokens for GPT-5-mini and max_tokens for older models. Anthropic extracts system prompt from messages and passes via system= kwarg, and concatenates text blocks from the response.", + "max_score": 30 + }, + { + "name": "Observability in finally block", + "description": "record_llm_call() is called in a finally block (not just after success), ensuring every LLM call is recorded to the database regardless of success or failure. Includes trace_id, call_type, model, cost, and latency.", + "max_score": 25 + }, + { + "name": "Framework detection", + "description": "detect_frameworks_from_code parses import statements to identify PyTorch, TensorFlow, and JAX. 
Handles both 'import torch' and aliased imports like 'import tensorflow as tf'.", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-internal-docs/evals/scenario-5/task.md b/tiles/codeflash-internal-docs/evals/scenario-5/task.md new file mode 100644 index 000000000..466874eaa --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/scenario-5/task.md @@ -0,0 +1,39 @@ +# Task: Build an LLM Client Wrapper with Provider-Specific Handling + +## Context + +The codeflash-internal aiservice uses a unified LLM abstraction in `aiservice/llm.py`. All LLM calls go through a single `call_llm()` function that handles both OpenAI (via Azure) and Anthropic (via Foundry) providers. Each provider has different client setup, message handling, and response parsing. + +The test generation pipeline also uses LLM calls, adding framework detection for GPU sync in timing blocks and Jinja2-based prompt construction. + +## Task + +1. Write an `LLM` dataclass (using `pydantic_dataclass`) with fields: + - `name: str` -- deployment name + - `max_tokens: int` -- max context window + - `model_type: Literal["openai", "anthropic", "google"]` + - `input_cost: float` -- USD per 1M tokens + - `cached_input_cost: float` -- USD per 1M cached tokens + - `output_cost: float` -- USD per 1M tokens + +2. Define concrete model instances: + - `OpenAI_GPT_5_Mini` with pricing: $0.25 input, $0.03 cached, $2.00 output + - `Anthropic_Claude_Sonnet_4_5` with pricing: $3.00 input, $0.00 cached, $15.00 output + +3. Write an `async def call_llm()` function that: + - Accepts `llm: LLM`, `messages: list[dict]`, `call_type: str`, `trace_id: str`, `max_tokens: int`, and optional `user_id: str` + - For OpenAI: uses `client.chat.completions.create()`. If the model is GPT-5-mini, uses `max_completion_tokens` parameter; otherwise uses `max_tokens` + - For Anthropic: extracts the system prompt from the messages list and passes it separately via the `system=` kwarg. 
Concatenates text blocks from the response + - Records every call to the database via `record_llm_call()` in a `finally` block (including trace_id, call_type, model, cost, latency) + - Returns an `LLMResponse` with `content: str`, `usage: LLMUsage`, and `raw_response` + +4. Write a `detect_frameworks_from_code(code: str) -> set[str]` function that: + - Parses import statements to identify ML frameworks: PyTorch, TensorFlow, JAX + - Detects both direct imports and aliases + - Returns a set of framework names found + +## Expected Outputs + +- A Python module with the LLM dataclass, model instances, call_llm function, and framework detection +- The call_llm function must handle both providers with their specific quirks +- record_llm_call must be in a finally block for observability diff --git a/tiles/codeflash-internal-docs/evals/summary.json b/tiles/codeflash-internal-docs/evals/summary.json new file mode 100644 index 000000000..f06b6544a --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/summary.json @@ -0,0 +1,40 @@ +{ + "total_scenarios": 5, + "capabilities_coverage": { + "total_capabilities": 14, + "capabilities_tested": 12, + "coverage_percentage": 85.7 + }, + "complexity_distribution": { + "basic": 1, + "intermediate": 2, + "advanced": 2 + }, + "scenarios": [ + { + "index": 1, + "capability": "model-distribution-formula, llm-cost-calculation, optimize-schema-structure", + "complexity": "intermediate" + }, + { + "index": 2, + "capability": "language-dispatch-routing, aiservice-endpoint-map, domain-models-relationships", + "complexity": "basic" + }, + { + "index": 3, + "capability": "context-extraction-types, llm-response-parsing, postprocessing-dedup", + "complexity": "intermediate" + }, + { + "index": 4, + "capability": "cf-api-route-ordering, aiservice-endpoint-map", + "complexity": "advanced" + }, + { + "index": 5, + "capability": "llm-dataclass-and-call, llm-client-setup, testgen-instrumentation", + "complexity": "advanced" + } + ] +} diff --git 
a/tiles/codeflash-internal-docs/evals/summary_infeasible.json b/tiles/codeflash-internal-docs/evals/summary_infeasible.json new file mode 100644 index 000000000..ef8a318a9 --- /dev/null +++ b/tiles/codeflash-internal-docs/evals/summary_infeasible.json @@ -0,0 +1,14 @@ +{ + "infeasible_capabilities": [ + { + "id": 1, + "name": "domain-models-relationships", + "reason": "The OptimizationFeatures, OptimizationEvents, and Repositories Django models are primarily data storage schemas. Testing whether an agent 'knows' their field names is a trivia quiz, not a realistic coding task. The models are partially covered by scenario-2 (schema conventions) but a dedicated scenario would devolve into rote recall rather than applied knowledge." + }, + { + "id": 2, + "name": "optimization-pipeline-flow", + "reason": "The full 6-step pipeline flow is an architectural overview that spans multiple modules and async patterns. A realistic eval would require the agent to orchestrate actual asyncio.TaskGroup calls with real LLM clients and context objects, which cannot be validated in a static code output scenario. Individual steps (distribution, context, postprocessing) are tested separately in scenarios 1, 3, and 5." 
+ } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/capabilities.json b/tiles/codeflash-internal-rules/evals/capabilities.json new file mode 100644 index 000000000..8e51f9712 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/capabilities.json @@ -0,0 +1,97 @@ +{ + "package_name": "codeflash-internal-rules", + "total_capabilities": 13, + "capabilities": [ + { + "id": 0, + "name": "python-async-endpoints", + "description": "All Django-Ninja endpoints must be async def, using asyncio.TaskGroup for concurrent operations and ninja.Schema or Pydantic BaseModel for request/response types", + "complexity": "basic", + "api_elements": ["async def", "asyncio.TaskGroup", "ninja.Schema", "BaseModel"] + }, + { + "id": 1, + "name": "libcst-for-transforms", + "description": "Use libcst for all code transformations that modify source code (preserves formatting); ast is only acceptable for read-only analysis", + "complexity": "intermediate", + "api_elements": ["libcst", "ast.parse", "ast.dump"] + }, + { + "id": 2, + "name": "llm-calls-via-llm-py", + "description": "All LLM provider calls must go through aiservice/llm.py, never calling provider APIs directly, with prompts stored as .md files rendered via Jinja2", + "complexity": "intermediate", + "api_elements": ["aiservice/llm.py", "Jinja2", ".md prompts"] + }, + { + "id": 3, + "name": "lazy-imports-in-routers", + "description": "Router endpoints must use lazy imports inside function bodies to avoid circular dependencies, with noqa: PLC0415 suppression comments", + "complexity": "basic", + "api_elements": ["noqa: PLC0415", "lazy import", "router dispatch"] + }, + { + "id": 4, + "name": "monorepo-directory-placement", + "description": "New code must be placed in the correct monorepo directory: language handlers in core/languages/<language>/, shared routers in core/shared/, tests in tests/ by feature", + "complexity": "basic", + "api_elements": ["core/languages/", "core/shared/", "tests/"] + }, + { + "id": 5, + "name":
"registry-handler-pattern", + "description": "New language handlers must use @register_handler decorator, set supports_* flags, implement protocol methods, and be imported in core/languages/__init__.py", + "complexity": "advanced", + "api_elements": ["@register_handler", "LanguageRegistry", "supports_*", "core/languages/__init__.py"] + }, + { + "id": 6, + "name": "feature-dispatcher-protocol", + "description": "Handlers must implement runtime-checkable protocols (OptimizerProtocol, TestGenProtocol, CodeRepairProtocol) and integrate with get_handler_for_feature() dispatcher", + "complexity": "advanced", + "api_elements": ["Feature enum", "get_handler_for_feature", "OptimizerProtocol", "TestGenProtocol", "CodeRepairProtocol"] + }, + { + "id": 7, + "name": "conventional-commits-and-branching", + "description": "Commits must use conventional format (fix:, feat:, refactor:, etc.), branches must be created from main with cf-#-title naming, and commits must be atomic", + "complexity": "basic", + "api_elements": ["fix:", "feat:", "refactor:", "cf-#-title", "main"] + }, + { + "id": 8, + "name": "pre-commit-and-tooling", + "description": "Run prek pre-commit hooks via uv run prek run --all-files, use Ruff for linting/formatting, and uv for all Python package operations", + "complexity": "basic", + "api_elements": ["uv run prek run --all-files", "ruff check", "ruff format", "uv run"] + }, + { + "id": 9, + "name": "python-test-conventions", + "description": "Tests organized by feature in tests/, use @pytest.mark.asyncio for async tests, leverage test factories like create_optimizer_context(), and run via uv run pytest", + "complexity": "intermediate", + "api_elements": ["@pytest.mark.asyncio", "DJANGO_SETTINGS_MODULE", "create_optimizer_context", "uv run pytest"] + }, + { + "id": 10, + "name": "pr-review-guidelines", + "description": "PR reviews must focus on logic errors, security vulnerabilities, test name typos, and breaking changes; must skip style/formatting comments and 
limit to 5-7 high-signal comments", + "complexity": "intermediate", + "api_elements": ["logic errors", "security vulnerabilities", "5-7 comments"] + }, + { + "id": 11, + "name": "cfapi-webhook-and-di-patterns", + "description": "cf-api webhook routes must be registered before body parser for signature verification, and tests must use setXxxDependencies()/resetXxxDependencies() DI pattern", + "complexity": "intermediate", + "api_elements": ["webhook before body parser", "setXxxDependencies", "resetXxxDependencies", "instrument.ts"] + }, + { + "id": 12, + "name": "optimization-postprocessing", + "description": "Optimization candidates must be deduplicated via ast.parse()+ast.dump(), checked for no-op equality against original code, and use model distribution formula claude_calls=(total-1)//2", + "complexity": "advanced", + "api_elements": ["ast.parse", "ast.dump", "postprocess.py", "get_model_distribution", "claude_calls = (total - 1) // 2"] + } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/scenario-1/capability.txt b/tiles/codeflash-internal-rules/evals/scenario-1/capability.txt new file mode 100644 index 000000000..7e69bde70 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-1/capability.txt @@ -0,0 +1 @@ +registry-handler-pattern, feature-dispatcher-protocol, python-async-endpoints, lazy-imports-in-routers, monorepo-directory-placement \ No newline at end of file diff --git a/tiles/codeflash-internal-rules/evals/scenario-1/criteria.json b/tiles/codeflash-internal-rules/evals/scenario-1/criteria.json new file mode 100644 index 000000000..cef810559 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-1/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests whether the agent correctly implements a new language handler following the registry pattern, protocol system, code style conventions, and monorepo directory structure", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Registry and protocol integration", + 
"description": "Handler class uses @register_handler('ruby') decorator, sets supports_optimizer=True and other supports_* to False, implements OptimizerProtocol with async optimizer_optimize method, and is imported in core/languages/__init__.py", + "max_score": 30 + }, + { + "name": "Lazy import dispatch in router", + "description": "optimizer_router.py dispatches 'ruby' language using a lazy import inside the endpoint body with # noqa: PLC0415 comment, following the existing pattern for js_ts", + "max_score": 20 + }, + { + "name": "Async and schema conventions", + "description": "Endpoint method is async def, uses ninja.Schema or Pydantic BaseModel for types, calls LLM through aiservice/llm.py (not provider APIs directly), and prompt is a .md file rendered with Jinja2", + "max_score": 25 + }, + { + "name": "Correct directory placement", + "description": "Handler is at core/languages/ruby/optimizer/optimizer.py, prompt .md file is alongside the module, test is at tests/optimizer/test_ruby_optimizer.py, and all __init__.py files are created", + "max_score": 15 + }, + { + "name": "Test conventions", + "description": "Test file uses @pytest.mark.asyncio for async tests, follows the feature-based test organization, and optionally uses test factories like create_optimizer_context()", + "max_score": 10 + } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/scenario-1/task.md b/tiles/codeflash-internal-rules/evals/scenario-1/task.md new file mode 100644 index 000000000..292f2fb5f --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-1/task.md @@ -0,0 +1,38 @@ +# Scenario 1: Implement a Ruby Optimizer Handler + +## Context + +The codeflash-internal monorepo currently supports Python and JavaScript/TypeScript optimization through the multi-language handler system. The team wants to add Ruby as a new supported language for the optimizer feature. The handler should integrate with the existing registry, dispatcher, and router systems. 
+ +The relevant existing code lives in: +- `django/aiservice/core/languages/` (per-language handlers) +- `django/aiservice/core/registry.py` (LanguageRegistry, @register_handler) +- `django/aiservice/core/dispatcher.py` (get_handler_for_feature, Feature enum) +- `django/aiservice/core/protocols/` (OptimizerProtocol, LanguageHandler) +- `django/aiservice/core/shared/optimizer_router.py` (shared router that dispatches by language) + +## Task + +1. Create a new Ruby optimizer handler at `django/aiservice/core/languages/ruby/optimizer/optimizer.py` that: + - Registers itself using the `@register_handler("ruby")` decorator + - Sets `supports_optimizer = True` and other `supports_*` flags to `False` + - Implements `OptimizerProtocol` with an `optimizer_optimize(request, data)` async method + - Uses `ninja.Schema` for any request/response models + - Calls LLM through the proper abstraction layer + - Stores the optimization prompt as a `.md` file alongside the module, rendered with Jinja2 + +2. Add the Ruby language routing in `core/shared/optimizer_router.py` using lazy imports with the `# noqa: PLC0415` suppression + +3. Import the Ruby handler module in `core/languages/__init__.py` so the decorator fires at startup + +4. 
Create a basic test file at `django/aiservice/tests/optimizer/test_ruby_optimizer.py` + +## Expected Outputs + +- `core/languages/ruby/__init__.py` +- `core/languages/ruby/optimizer/__init__.py` +- `core/languages/ruby/optimizer/optimizer.py` — handler class with decorator, protocol, async method +- `core/languages/ruby/optimizer/optimize_prompt.md` — Jinja2 prompt template +- Updated `core/shared/optimizer_router.py` — lazy import dispatch for "ruby" +- Updated `core/languages/__init__.py` — import of ruby module +- `tests/optimizer/test_ruby_optimizer.py` — async test with @pytest.mark.asyncio diff --git a/tiles/codeflash-internal-rules/evals/scenario-2/capability.txt b/tiles/codeflash-internal-rules/evals/scenario-2/capability.txt new file mode 100644 index 000000000..349f8c6a8 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-2/capability.txt @@ -0,0 +1 @@ +cfapi-webhook-and-di-patterns, monorepo-directory-placement, python-test-conventions \ No newline at end of file diff --git a/tiles/codeflash-internal-rules/evals/scenario-2/criteria.json b/tiles/codeflash-internal-rules/evals/scenario-2/criteria.json new file mode 100644 index 000000000..84a164340 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-2/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent correctly follows cf-api conventions for webhook registration order relative to body parser and uses the dependency injection testing pattern", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Webhook before body parser", + "description": "The Stripe webhook route is registered in the Express app BEFORE the body parser middleware, so the raw body is available for signature verification. 
This is the most critical requirement from the code-style rules.", + "max_score": 35 + }, + { + "name": "DI testing pattern", + "description": "Tests use setStripeDependencies() to inject mocks and resetStripeDependencies() in teardown, following the cf-api dependency injection convention rather than jest.mock() or similar", + "max_score": 30 + }, + { + "name": "Correct service architecture", + "description": "Code is placed in js/cf-api/routes/, uses Prisma from common/ with require-style imports for the CommonJS module, and follows Express routing conventions", + "max_score": 20 + }, + { + "name": "ESLint and Prettier compliance", + "description": "Code follows JS/TS style: ESLint + Prettier formatting, 100-char width without semicolons (cf-api config, not VSC-Extension config which uses 80 + semicolons)", + "max_score": 15 + } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/scenario-2/task.md b/tiles/codeflash-internal-rules/evals/scenario-2/task.md new file mode 100644 index 000000000..879512b09 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-2/task.md @@ -0,0 +1,35 @@ +# Scenario 2: Add a Stripe Webhook Endpoint to cf-api + +## Context + +The codeflash-internal monorepo has a cf-api Express server at `js/cf-api/` that acts as the middleware layer between the VS Code Extension and the aiservice backend. The team needs to add a new Stripe webhook endpoint that processes subscription change events. + +Key architectural constraints: +- cf-api is an Express app running on port 3001 +- Webhook routes MUST be registered before the body parser middleware (raw body needed for Stripe signature verification) +- `instrument.ts` must be imported first in the entry point (Sentry) +- Tests use dependency injection: `setXxxDependencies()` / `resetXxxDependencies()` +- Prisma schema is in `common/prisma/schema.prisma`, shared by cf-api and cf-webapp +- `common` is CommonJS -- use `require`-style imports + +## Task + +1. 
Create a new webhook route handler at `js/cf-api/routes/stripeWebhook.ts` that: + - Exports a router that handles POST `/webhooks/stripe` + - Uses the raw request body for Stripe signature verification + - Processes `customer.subscription.updated` and `customer.subscription.deleted` events + - Updates the subscription status in the database via Prisma + +2. Register the webhook route in the Express app BEFORE the body parser middleware (this is critical for signature verification) + +3. Create a test file at `js/cf-api/__tests__/stripeWebhook.test.ts` that: + - Uses the dependency injection pattern (`setStripeDependencies()` / `resetStripeDependencies()`) + - Mocks the Stripe signature verification + - Tests both event types + +## Expected Outputs + +- `js/cf-api/routes/stripeWebhook.ts` -- route handler with raw body parsing +- Updated Express app setup showing webhook route registered before body parser +- `js/cf-api/__tests__/stripeWebhook.test.ts` -- tests using DI pattern +- Any dependency injection setup (e.g., `setStripeDependencies()`) diff --git a/tiles/codeflash-internal-rules/evals/scenario-3/capability.txt b/tiles/codeflash-internal-rules/evals/scenario-3/capability.txt new file mode 100644 index 000000000..310eec05f --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-3/capability.txt @@ -0,0 +1 @@ +conventional-commits-and-branching, pre-commit-and-tooling, lazy-imports-in-routers \ No newline at end of file diff --git a/tiles/codeflash-internal-rules/evals/scenario-3/criteria.json b/tiles/codeflash-internal-rules/evals/scenario-3/criteria.json new file mode 100644 index 000000000..f4a884aa2 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-3/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent follows git conventions (branching from main, conventional commits, branch naming, pre-commit) and maintains code style (lazy imports, async) when fixing a bug", + "type": "weighted_checklist", + "checklist": [ + 
{ + "name": "Conventional commit message", + "description": "Commit uses fix: prefix since this is a bug fix. Message is concise (1-2 sentences). Does not use feat:, refactor:, or other incorrect prefixes.", + "max_score": 25 + }, + { + "name": "Branch naming and creation", + "description": "Branch is created from main (not from the current branch) and follows the cf-#-title pattern (e.g. cf-847-fix-language-alias-normalization or similar), lowercase and hyphenated", + "max_score": 25 + }, + { + "name": "Pre-commit and atomic commit", + "description": "Runs uv run prek run --all-files before committing, commit is atomic (one logical change: the bug fix and its test together), and Linear reference CF-847 is included", + "max_score": 25 + }, + { + "name": "Code style preserved in fix", + "description": "Fix preserves lazy import pattern with # noqa: PLC0415, router dispatch structure is maintained, and test uses @pytest.mark.asyncio and feature-based organization in tests/optimizer/", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/scenario-3/task.md b/tiles/codeflash-internal-rules/evals/scenario-3/task.md new file mode 100644 index 000000000..f4012888f --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-3/task.md @@ -0,0 +1,33 @@ +# Scenario 3: Fix a Bug in the Optimizer Router and Commit + +## Context + +A bug has been reported on Linear issue CF-847: the optimizer router in `core/shared/optimizer_router.py` fails to handle the case where `data.language` is `"ts"` (a shorthand some clients send instead of `"typescript"`). This causes a 500 error because the request falls through to the Python handler, which cannot parse TypeScript code. + +The fix needs to normalize language aliases before dispatching. The developer should fix the bug, create tests, and commit the changes following the project's git conventions. + +## Task + +1.
Fix the bug in `core/shared/optimizer_router.py`: + - Add normalization of language aliases (`"ts"` -> `"typescript"`, `"js"` -> `"javascript"`, `"py"` -> `"python"`) + - Preserve the existing lazy import pattern for dispatch + - Keep the fix minimal and focused + +2. Add a test in `tests/optimizer/test_language_normalization.py`: + - Test that language aliases are correctly normalized + - Use `@pytest.mark.asyncio` for async tests + - Follow the feature-based test organization + +3. Commit the changes following git conventions: + - Create a new branch from `main` named `cf-847-fix-language-alias-normalization` + - Run pre-commit checks: `uv run prek run --all-files` + - Make an atomic commit with a conventional commit message (this is a bug fix, so `fix:`) + - Include `CF-847` reference in the commit body or branch name + +## Expected Outputs + +- Updated `core/shared/optimizer_router.py` with alias normalization +- New `tests/optimizer/test_language_normalization.py` +- Git branch: `cf-847-fix-language-alias-normalization` (created from `main`) +- Commit message: `fix: normalize language aliases in optimizer router` (or similar conventional format) +- Pre-commit checks passed before commit diff --git a/tiles/codeflash-internal-rules/evals/scenario-4/capability.txt b/tiles/codeflash-internal-rules/evals/scenario-4/capability.txt new file mode 100644 index 000000000..05d6d6fa5 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-4/capability.txt @@ -0,0 +1 @@ +pr-review-guidelines, python-async-endpoints, python-test-conventions \ No newline at end of file diff --git a/tiles/codeflash-internal-rules/evals/scenario-4/criteria.json b/tiles/codeflash-internal-rules/evals/scenario-4/criteria.json new file mode 100644 index 000000000..b4df12769 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-4/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests whether the agent follows PR review guidelines: commenting only on critical issues (logic 
errors, security, test typos, breaking changes) and skipping style/formatting/suggestion comments", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Identifies security vulnerability", + "description": "Flags the hardcoded Redis password 'codeflash123' in the source code as a security vulnerability. Credentials should not be committed to the codebase.", + "max_score": 25 + }, + { + "name": "Identifies test name typo", + "description": "Catches the typo in the test docstring 'test optimzier cache hit' (optimzier -> optimizer). While this is a docstring, not a method name, the agent should note that test method names with typos won't be discovered by the test runner; here the method names are correct, but flagging the docstring shows awareness of the rule.", + "max_score": 20 + }, + { + "name": "Identifies breaking change", + "description": "Flags that changing OptimizeRequestSchema.use_cache from no default to a default value (or adding a new required->optional field) could be a breaking change for existing clients, and that the test file is in the wrong directory (core/shared/ instead of tests/)", + "max_score": 25 + }, + { + "name": "Skips style and suggestion comments", + "description": "Does NOT comment on code formatting, import ordering, variable naming, or offer 'consider using X' suggestions. Does not suggest performance improvements without profiling data. Stays within the 5-7 comment limit.", + "max_score": 15 + }, + { + "name": "Identifies logic error with sync Redis in async endpoint", + "description": "Flags that using synchronous redis.Redis calls inside an async def endpoint will block the event loop.
Should use async Redis client (aioredis or redis.asyncio).", + "max_score": 15 + } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/scenario-4/task.md b/tiles/codeflash-internal-rules/evals/scenario-4/task.md new file mode 100644 index 000000000..b9dde5d50 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-4/task.md @@ -0,0 +1,82 @@ +# Scenario 4: Review a Pull Request + +## Context + +A teammate has submitted a PR that adds a caching layer to the optimizer. You need to review this PR following the project's PR review guidelines. The PR diff contains the following changes: + +### File: `core/shared/optimizer_router.py` + +```python +import redis +import json +import hashlib +from ninja import Router +from core.shared.optimizer_models import OptimizeRequestSchema, OptimizeResponseSchema + +router = Router() + +# Initialize Redis client +redis_client = redis.Redis(host="localhost", port=6379, db=0, password="codeflash123") + +@router.post("/optimize", response=OptimizeResponseSchema) +async def optimize(request, data: OptimizeRequestSchema): + # Generate cache key from request + cache_key = hashlib.md5(json.dumps(data.dict(), sort_keys=True).encode()).hexdigest() + + cached = redis_client.get(cache_key) + if cached: + return json.loads(cached) + + if data.language in ("javascript", "typescript"): + from core.languages.js_ts.optimizer import optimize_javascript # noqa: PLC0415 + result = await optimize_javascript(request, data) + else: + from core.languages.python.optimizer import optimize_python # noqa: PLC0415 + result = await optimize_python(request, data) + + redis_client.set(cache_key, json.dumps(result.dict()), ex=3600) + return result +``` + +### File: `core/shared/test_optimizer_cache.py` + +```python +import pytest +from unittest.mock import patch, MagicMock + +def test_cache_hit(): + """test optimzier cache hit""" + mock_redis = MagicMock() + mock_redis.get.return_value = '{"code": "cached"}' + with 
patch("core.shared.optimizer_router.redis_client", mock_redis): + # ... test implementation + pass + +def test_cache_miss(): + mock_redis = MagicMock() + mock_redis.get.return_value = None + with patch("core.shared.optimizer_router.redis_client", mock_redis): + # ... test implementation + pass +``` + +### File: `core/shared/optimizer_models.py` + +```python +# Added new field +class OptimizeRequestSchema(Schema): + code: str + language: str = "python" + use_cache: bool = True # Changed from no default +``` + +## Task + +Review this PR following the codeflash-internal PR review guidelines. Provide your review comments, focusing only on what the guidelines say to comment on, and explicitly skipping what they say to skip. + +## Expected Outputs + +- A list of review comments (limit to 5-7 high-signal comments) +- Each comment should identify a specific issue and explain why it matters +- Comments should only cover: logic errors, security vulnerabilities, test name typos, breaking changes +- Should NOT include: style/formatting nitpicks, "consider" suggestions, performance opinions without data diff --git a/tiles/codeflash-internal-rules/evals/scenario-5/capability.txt b/tiles/codeflash-internal-rules/evals/scenario-5/capability.txt new file mode 100644 index 000000000..fe5c38568 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-5/capability.txt @@ -0,0 +1 @@ +optimization-postprocessing, libcst-for-transforms, llm-calls-via-llm-py, python-test-conventions \ No newline at end of file diff --git a/tiles/codeflash-internal-rules/evals/scenario-5/criteria.json b/tiles/codeflash-internal-rules/evals/scenario-5/criteria.json new file mode 100644 index 000000000..c682e779d --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-5/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests whether the agent follows optimization postprocessing patterns (AST dedup, no-op checks, model distribution), code style conventions (libcst for transforms, ast for 
reads, LLM via llm.py, Jinja2 prompts), and test conventions", + "type": "weighted_checklist", + "checklist": [ + { + "name": "AST deduplication and no-op detection", + "description": "Postprocessor deduplicates candidates by normalizing JS/TS code to an AST representation and comparing dumps. Detects no-ops by comparing optimized code to original. Follows the pattern from the Python postprocessor.", + "max_score": 30 + }, + { + "name": "libcst vs ast usage", + "description": "Uses libcst for any code transformations that modify source (preserves formatting). Uses ast module only for read-only analysis (parsing, dumping for comparison). Does not use ast for code modification.", + "max_score": 25 + }, + { + "name": "LLM and prompt conventions", + "description": "LLM calls go through aiservice/llm.py (not direct provider API calls). Prompt is stored as a .md file alongside the module and rendered with Jinja2. Model distribution follows claude_calls = (total - 1) // 2 formula.", + "max_score": 25 + }, + { + "name": "Test structure and async conventions", + "description": "Tests are in tests/optimizer/ (feature-based organization), use @pytest.mark.asyncio for async tests, and test both deduplication and no-op detection scenarios", + "max_score": 20 + } + ] +} diff --git a/tiles/codeflash-internal-rules/evals/scenario-5/task.md b/tiles/codeflash-internal-rules/evals/scenario-5/task.md new file mode 100644 index 000000000..b2a9f6f03 --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/scenario-5/task.md @@ -0,0 +1,43 @@ +# Scenario 5: Add a JavaScript/TypeScript Postprocessor for Optimization Deduplication + +## Context + +The codeflash-internal optimizer postprocesses optimization candidates to remove duplicates and no-ops. Currently, the Python postprocessor in `core/languages/python/optimizer/postprocess.py` deduplicates by normalizing code with `ast.parse()` + `ast.dump()` and comparing AST dumps. It uses `libcst` for any code transformations that modify source. 
+ +The team needs a JavaScript/TypeScript equivalent postprocessor that follows the same patterns adapted for JS/TS, using the appropriate tools for that language while following the same architectural conventions. + +The relevant existing patterns: +- `BaseOptimizerContext` in `optimizer_context.py` handles prompt management and code extraction +- `extract_code_and_explanation_from_llm_res()` parses LLM markdown response into code blocks +- `parse_and_generate_candidate_schema()` converts extracted code to `OptimizeResponseItemSchema` +- Model distribution: `claude_calls = (total - 1) // 2`, `gpt_calls = total - claude_calls` +- LLM calls go through `aiservice/llm.py`, prompts are `.md` files rendered with Jinja2 + +## Task + +1. Create a postprocessor at `core/languages/js_ts/optimizer/postprocess.py` that: + - Deduplicates optimization candidates for JavaScript/TypeScript code + - Uses a JS/TS AST normalization approach (can shell out to a Node.js script or use a Python JS parser) + - Checks for no-op equality (optimized code identical to original) + - Uses `libcst` for any Python source transforms if needed (not `ast` module for transforms) + - Uses `ast` only for read-only analysis operations + - Follows the `OptimizeResponseItemSchema` output format + +2. Create a prompt template at `core/languages/js_ts/optimizer/postprocess_prompt.md` for any LLM-assisted dedup decisions, using Jinja2 template syntax + +3. Ensure the postprocessor integrates with the model distribution formula: + - `claude_calls = (total - 1) // 2` + - `gpt_calls = total - claude_calls` + - `MAX_OPTIMIZER_CALLS = 6` + +4. 
Create tests at `tests/optimizer/test_js_ts_postprocess.py`: + - Test deduplication of identical-AST candidates + - Test no-op detection + - Use `@pytest.mark.asyncio` for async tests + - Use test factories where applicable + +## Expected Outputs + +- `core/languages/js_ts/optimizer/postprocess.py` -- dedup and validation logic +- `core/languages/js_ts/optimizer/postprocess_prompt.md` -- Jinja2 prompt template +- `tests/optimizer/test_js_ts_postprocess.py` -- async tests for dedup and no-op detection diff --git a/tiles/codeflash-internal-rules/evals/summary.json b/tiles/codeflash-internal-rules/evals/summary.json new file mode 100644 index 000000000..e23226aae --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/summary.json @@ -0,0 +1,70 @@ +{ + "package_name": "codeflash-internal-rules", + "total_scenarios": 5, + "total_capabilities_tested": 13, + "scenarios": [ + { + "id": 1, + "name": "Implement a Ruby Optimizer Handler", + "description": "Create a new language handler for Ruby following the registry pattern, protocol system, async conventions, and monorepo directory structure", + "capabilities_tested": [5, 6, 0, 3, 4], + "capability_names": ["registry-handler-pattern", "feature-dispatcher-protocol", "python-async-endpoints", "lazy-imports-in-routers", "monorepo-directory-placement"], + "rules_covered": ["multi-language-handlers", "code-style", "architecture", "optimization-patterns"] + }, + { + "id": 2, + "name": "Add a Stripe Webhook Endpoint to cf-api", + "description": "Add a webhook endpoint ensuring correct registration order relative to body parser and testing with DI pattern", + "capabilities_tested": [11, 4, 9], + "capability_names": ["cfapi-webhook-and-di-patterns", "monorepo-directory-placement", "python-test-conventions"], + "rules_covered": ["code-style", "architecture", "testing-rules"] + }, + { + "id": 3, + "name": "Fix a Bug in the Optimizer Router and Commit", + "description": "Fix a language alias bug, write tests, and commit following git 
conventions with proper branch naming, conventional commits, and pre-commit checks", + "capabilities_tested": [7, 8, 3], + "capability_names": ["conventional-commits-and-branching", "pre-commit-and-tooling", "lazy-imports-in-routers"], + "rules_covered": ["git-conventions", "code-style", "optimization-patterns"] + }, + { + "id": 4, + "name": "Review a Pull Request", + "description": "Review a PR containing security issues, test typos, and breaking changes while following the guidelines to skip style and suggestion comments", + "capabilities_tested": [10, 0, 9], + "capability_names": ["pr-review-guidelines", "python-async-endpoints", "python-test-conventions"], + "rules_covered": ["testing-rules", "code-style"] + }, + { + "id": 5, + "name": "Add a JS/TS Postprocessor for Optimization Deduplication", + "description": "Implement a postprocessor following optimization patterns (AST dedup, model distribution), code style (libcst for transforms, llm.py for calls, Jinja2 prompts), and test conventions", + "capabilities_tested": [12, 1, 2, 9], + "capability_names": ["optimization-postprocessing", "libcst-for-transforms", "llm-calls-via-llm-py", "python-test-conventions"], + "rules_covered": ["optimization-patterns", "code-style", "testing-rules"] + } + ], + "capability_coverage": { + "0": {"name": "python-async-endpoints", "tested_in": [1, 4]}, + "1": {"name": "libcst-for-transforms", "tested_in": [5]}, + "2": {"name": "llm-calls-via-llm-py", "tested_in": [5]}, + "3": {"name": "lazy-imports-in-routers", "tested_in": [1, 3]}, + "4": {"name": "monorepo-directory-placement", "tested_in": [1, 2]}, + "5": {"name": "registry-handler-pattern", "tested_in": [1]}, + "6": {"name": "feature-dispatcher-protocol", "tested_in": [1]}, + "7": {"name": "conventional-commits-and-branching", "tested_in": [3]}, + "8": {"name": "pre-commit-and-tooling", "tested_in": [3]}, + "9": {"name": "python-test-conventions", "tested_in": [2, 4, 5]}, + "10": {"name": "pr-review-guidelines", "tested_in": 
[4]}, + "11": {"name": "cfapi-webhook-and-di-patterns", "tested_in": [2]}, + "12": {"name": "optimization-postprocessing", "tested_in": [5]} + }, + "rules_coverage": { + "code-style": [1, 2, 3, 4, 5], + "architecture": [1, 2], + "optimization-patterns": [1, 3, 5], + "git-conventions": [3], + "testing-rules": [2, 4, 5], + "multi-language-handlers": [1] + } +} diff --git a/tiles/codeflash-internal-rules/evals/summary_infeasible.json b/tiles/codeflash-internal-rules/evals/summary_infeasible.json new file mode 100644 index 000000000..36b7e721b --- /dev/null +++ b/tiles/codeflash-internal-rules/evals/summary_infeasible.json @@ -0,0 +1,5 @@ +{ + "package_name": "codeflash-internal-rules", + "infeasible_scenarios": [], + "notes": "All 5 scenarios are feasible. Each scenario tests realistic implementation tasks that an agent with internalized rules should be able to complete. No scenarios require access to external services, live databases, or proprietary APIs that would make them infeasible to evaluate." 
+} diff --git a/tiles/codeflash-internal-skills/evals/capabilities.json b/tiles/codeflash-internal-skills/evals/capabilities.json new file mode 100644 index 000000000..14d7aa632 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/capabilities.json @@ -0,0 +1,97 @@ +{ + "package_name": "codeflash-internal-skills", + "total_capabilities": 13, + "capabilities": [ + { + "id": 0, + "name": "diagnose-request-validation-failure", + "description": "Identify when an optimization or testgen request fails due to invalid schema fields (missing source_code, unsupported language, bad n_candidates)", + "complexity": "basic", + "api_elements": ["core/shared/optimizer_models.py", "OptimizeSchema", "Pydantic validation"] + }, + { + "id": 1, + "name": "diagnose-router-dispatch-failure", + "description": "Identify when router dispatch fails because a language is unsupported or a lazy import raises ImportError", + "complexity": "basic", + "api_elements": ["core/shared/optimizer_router.py", "core/shared/testgen_router.py", "data.language"] + }, + { + "id": 2, + "name": "diagnose-llm-client-failure", + "description": "Identify when LLM calls fail due to missing environment variables, None client, or API key issues", + "complexity": "intermediate", + "api_elements": ["aiservice/llm.py", "get_llm_client()", "AZURE_OPENAI_API_KEY", "ANTHROPIC_FOUNDRY_API_KEY"] + }, + { + "id": 3, + "name": "diagnose-postprocessing-removal", + "description": "Identify when all optimization candidates are removed by deduplication or equality checks in postprocessing", + "complexity": "intermediate", + "api_elements": ["core/languages/python/optimizer/postprocess.py", "deduplicate_optimizations()", "equality_check()"] + }, + { + "id": 4, + "name": "diagnose-logging-discrepancy", + "description": "Use optimization logging (optimizations_raw vs optimizations_post) to pinpoint where candidates are lost", + "complexity": "basic", + "api_elements": ["core/log_features/models.py", "OptimizationFeatures", 
"record_llm_call()"] + }, + { + "id": 5, + "name": "create-language-directory-structure", + "description": "Create the correct directory layout for a new language under core/languages/ with __init__.py, optimizer/, and testgen/ subdirectories", + "complexity": "basic", + "api_elements": ["core/languages/<language>/", "core/languages/<language>/optimizer/", "core/languages/<language>/testgen/"] + }, + { + "id": 6, + "name": "implement-language-handler", + "description": "Implement a handler class with @register_handler decorator, correct supports_* flags, and protocol methods", + "complexity": "intermediate", + "api_elements": ["@register_handler", "LanguageHandler", "supports_testgen", "supports_optimizer", "core/protocols/base.py"] + }, + { + "id": 7, + "name": "update-routers-for-language", + "description": "Add lazy-import dispatch branches to optimizer_router.py and testgen_router.py for a new language", + "complexity": "intermediate", + "api_elements": ["core/shared/optimizer_router.py", "core/shared/testgen_router.py", "lazy import", "noqa: PLC0415"] + }, + { + "id": 8, + "name": "add-aiservice-endpoint", + "description": "Create a Django-Ninja endpoint with NinjaAPI router, ninja.Schema request/response types, async def handler, and AuthenticatedRequest", + "complexity": "intermediate", + "api_elements": ["NinjaAPI", "ninja.Schema", "AuthenticatedRequest", "aiservice/urls.py", "async def"] + }, + { + "id": 9, + "name": "add-cfapi-express-endpoint", + "description": "Create an Express endpoint in cf-api with correct route registration order (webhooks before body parser) and middleware", + "complexity": "advanced", + "api_elements": ["js/cf-api/endpoints/", "js/cf-api/routes/index.ts", "addAsync(Router())", "checkForValidAPIKey"] + }, + { + "id": 10, + "name": "register-endpoint-urls", + "description": "Register endpoints in aiservice/urls.py (path pattern ai/) or cf-api routes/index.ts with correct middleware ordering", + "complexity": "basic", + "api_elements": 
["aiservice/urls.py", 
"urlpatterns", "js/cf-api/routes/index.ts"] + }, + { + "id": 11, + "name": "diagnose-testgen-prompt-failure", + "description": "Identify when test generation fails due to empty or malformed prompts from Jinja2 template rendering issues", + "complexity": "advanced", + "api_elements": ["core/languages/python/testgen/testgen.py", "build_prompt()", "Jinja2 templates"] + }, + { + "id": 12, + "name": "diagnose-testgen-instrumentation-failure", + "description": "Identify when instrumented tests fail to compile due to import handling or device sync injection issues", + "complexity": "advanced", + "api_elements": ["core/languages/python/testgen/instrumentation/instrument_new_tests.py", "instrument_tests()", "detect_frameworks_from_code()", "_create_device_sync_precompute_statements()"] + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/scenario-1/capability.txt b/tiles/codeflash-internal-skills/evals/scenario-1/capability.txt new file mode 100644 index 000000000..d778db19a --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-1/capability.txt @@ -0,0 +1 @@ +Diagnose optimization failure caused by unsupported language in router dispatch and identify the correct fix path. 
diff --git a/tiles/codeflash-internal-skills/evals/scenario-1/criteria.json b/tiles/codeflash-internal-skills/evals/scenario-1/criteria.json new file mode 100644 index 000000000..8b66255f2 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-1/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests ability to diagnose a router dispatch failure caused by an unsupported language and identify the correct fix path", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Identifies router dispatch stage", + "description": "Correctly identifies that the failure occurs at the router dispatch stage (Step 2) in core/shared/optimizer_router.py, not at request validation or LLM calls", + "max_score": 30 + }, + { + "name": "Explains unsupported language cause", + "description": "Explains that 'rust' is not among the supported languages (python, javascript, typescript, java) and there is no dispatch branch for it", + "max_score": 25 + }, + { + "name": "References correct files", + "description": "References both core/shared/optimizer_router.py (where dispatch happens) and core/shared/optimizer_models.py (where validation could be added)", + "max_score": 20 + }, + { + "name": "Proposes valid fix", + "description": "Proposes either adding language validation to the schema to reject unsupported languages at request time, or implementing Rust support via the add-language-support workflow", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/scenario-1/task.md b/tiles/codeflash-internal-skills/evals/scenario-1/task.md new file mode 100644 index 000000000..461d51655 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-1/task.md @@ -0,0 +1,38 @@ +# Scenario: Optimization request for "rust" returns 500 error + +## Context + +A user reports that their optimization request fails immediately with a 500 error. 
The request payload is: + +```json +{ + "source_code": "fn main() { println!(\"Hello, world!\"); }", + "trace_id": "abc-123", + "language": "rust", + "n_candidates": 5 +} +``` + +The server log shows: + +``` +ERROR 2026-02-14 10:22:15 optimizer_router: Unhandled language dispatch for 'rust' +``` + +No candidates are returned. The response body is `{"message": "Internal server error"}`. + +## Task + +Diagnose why this optimization request fails. Walk through the relevant stages of the optimization pipeline to identify the root cause. Provide: + +1. The exact stage where the failure occurs. +2. The file(s) responsible for the failure. +3. An explanation of why "rust" causes the failure. +4. The fix: what would need to change to either reject "rust" cleanly with a 400 validation error, or to support it as a new language. + +## Expected Outputs + +- Identification that the failure is at the **router dispatch** stage (Step 2 of the debug-optimization-failure workflow). +- Reference to `core/shared/optimizer_router.py` as the file where dispatch happens. +- Explanation that "rust" is not in the set of supported languages (`python`, `javascript`, `typescript`, `java`), so the router has no branch for it. +- A recommendation to either add supported-language validation to `OptimizeSchema` in `core/shared/optimizer_models.py` so that unsupported languages such as `"rust"` are rejected at Step 1, or to implement Rust support following the add-language-support workflow. diff --git a/tiles/codeflash-internal-skills/evals/scenario-2/capability.txt b/tiles/codeflash-internal-skills/evals/scenario-2/capability.txt new file mode 100644 index 000000000..cc4db9b94 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-2/capability.txt @@ -0,0 +1 @@ +Diagnose postprocessing stage removing all optimization candidates and use logging data to confirm the failure point. 
diff --git a/tiles/codeflash-internal-skills/evals/scenario-2/criteria.json b/tiles/codeflash-internal-skills/evals/scenario-2/criteria.json new file mode 100644 index 000000000..5081978b0 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-2/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests ability to diagnose postprocessing as the stage that removed all candidates, and to use logging data to confirm the diagnosis", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Identifies postprocessing stage", + "description": "Correctly identifies the postprocessing stage (Step 5) in core/languages/python/optimizer/postprocess.py as where candidates are lost", + "max_score": 25 + }, + { + "name": "Explains both postprocessing checks", + "description": "Explains that deduplicate_optimizations() removes AST-identical candidates via ast.parse()/ast.dump(), and equality_check() removes candidates identical to the original code", + "max_score": 25 + }, + { + "name": "Uses logging to confirm diagnosis", + "description": "References the logging table (optimizations_raw vs optimizations_post in core/log_features/models.py) to confirm that candidates existed before postprocessing but were all filtered out", + "max_score": 25 + }, + { + "name": "Provides actionable recommendation", + "description": "Recommends increasing n_candidates, improving prompt quality, or checking that the function is non-trivial enough to optimize", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/scenario-2/task.md b/tiles/codeflash-internal-skills/evals/scenario-2/task.md new file mode 100644 index 000000000..1327facef --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-2/task.md @@ -0,0 +1,44 @@ +# Scenario: All optimization candidates silently disappear + +## Context + +An optimization request for Python code completes without errors, but returns zero candidates. 
The request payload is valid: + +```json +{ + "source_code": "def add(a, b):\n return a + b", + "trace_id": "trace-456", + "language": "python", + "n_candidates": 5 +} +``` + +The server logs show: + +``` +INFO 2026-02-14 11:05:00 llm: call_llm completed for model gpt-4o, received response +INFO 2026-02-14 11:05:01 llm: call_llm completed for model gpt-4o, received response +INFO 2026-02-14 11:05:01 llm: call_llm completed for model claude-sonnet, received response +INFO 2026-02-14 11:05:02 postprocess: deduplicate_optimizations removed 4 of 5 candidates +INFO 2026-02-14 11:05:02 postprocess: equality_check removed 1 of 1 remaining candidates +INFO 2026-02-14 11:05:02 optimizer: 0 candidates after postprocessing +``` + +The optimization logging table shows `optimizations_raw = 5` but `optimizations_post = 0`. + +## Task + +Diagnose why all candidates were removed. Walk through the optimization pipeline to find the failure stage, explain why this happens for a trivial function like `add`, and recommend a fix. + +1. Identify which stage removed the candidates. +2. Explain the two postprocessing checks that reduced 5 candidates to 0. +3. Explain why `optimizations_raw = 5` but `optimizations_post = 0` in the logging table. +4. Recommend what to do when all candidates are removed by postprocessing. + +## Expected Outputs + +- Identification that the failure is at the **postprocessing** stage (Step 5 of the debug-optimization-failure workflow). +- Explanation that `deduplicate_optimizations()` in `core/languages/python/optimizer/postprocess.py` uses `ast.parse()` + `ast.dump()` to remove candidates with identical ASTs, and `equality_check()` removes candidates identical to the original code. +- For a trivial function like `add`, all LLM candidates likely generate the same code (or code identical to the original), so dedup and equality checks remove everything. 
+- The logging discrepancy (`optimizations_raw` vs `optimizations_post`) confirms candidates existed before postprocessing but were all filtered. +- Recommendation: increase `n_candidates`, improve prompt quality, or check that the function is non-trivial enough to optimize. diff --git a/tiles/codeflash-internal-skills/evals/scenario-3/capability.txt b/tiles/codeflash-internal-skills/evals/scenario-3/capability.txt new file mode 100644 index 000000000..7fde1a0fc --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-3/capability.txt @@ -0,0 +1 @@ +Add a new language (Go) to the optimization system following the 7-step add-language-support workflow. diff --git a/tiles/codeflash-internal-skills/evals/scenario-3/criteria.json b/tiles/codeflash-internal-skills/evals/scenario-3/criteria.json new file mode 100644 index 000000000..ac49ed850 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-3/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests ability to follow the add-language-support workflow to add Go, including handler registration, router updates, and correct supports_* flags", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Correct directory structure", + "description": "Creates core/languages/go/ with __init__.py, optimizer/__init__.py, and optimizer/optimizer.py following the existing language module pattern", + "max_score": 20 + }, + { + "name": "Handler class with decorator and flags", + "description": "Implements GoHandler with @register_handler('go') decorator, supports_optimizer=True, supports_testgen=False, and all other supports_* flags explicitly set to False", + "max_score": 25 + }, + { + "name": "Router dispatch with lazy import", + "description": "Adds an 'if data.language == \"go\"' branch to core/shared/optimizer_router.py using a lazy import inside the function body (not at module level)", + "max_score": 25 + }, + { + "name": "Module registration", + "description": "Adds import of core.languages.go in 
core/languages/__init__.py so the 
@register_handler decorator fires on startup", + "max_score": 15 + }, + { + "name": "Test plan", + "description": "Describes tests for handler registration (registry.get_handler('go')), feature dispatch (get_handler_for_feature), and a mocked end-to-end optimization flow using @pytest.mark.asyncio", + "max_score": 15 + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/scenario-3/task.md b/tiles/codeflash-internal-skills/evals/scenario-3/task.md new file mode 100644 index 000000000..7ab01bfc9 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-3/task.md @@ -0,0 +1,30 @@ +# Scenario: Add Go language support to the optimization system + +## Context + +The team wants to add Go as a supported language for the code optimization pipeline. Go support should include optimization but not test generation. The handler should follow the same patterns as the existing Python and JavaScript/TypeScript implementations. + +The existing language modules are: +- `core/languages/python/` -- full support (optimizer, testgen, code repair) +- `core/languages/js_ts/` -- optimizer and testgen +- `core/languages/java/` -- optimizer and testgen + +## Task + +Outline the complete implementation plan for adding Go language support. Provide: + +1. The exact directory structure to create. +2. The handler class implementation with correct decorator and flags. +3. The router changes needed in `core/shared/optimizer_router.py`. +4. The module registration approach so `@register_handler` fires on startup. +5. The test plan for verifying the new language works. + +Write the implementation code for the handler class and the router dispatch change. Do NOT implement the actual optimization logic -- just the skeleton. + +## Expected Outputs + +- Directory structure: `core/languages/go/` with `__init__.py`, `optimizer/__init__.py`, `optimizer/optimizer.py`. 
+- Handler class in `core/languages/go/__init__.py` using `@register_handler("go")` with `supports_optimizer = True` and all other `supports_*` flags set to `False`. +- Router change: a new `if data.language == "go":` branch in `core/shared/optimizer_router.py` with a lazy import of `optimize_go` from `core.languages.go.optimizer`. +- Registration: import added to `core/languages/__init__.py`. +- Tests: handler registration test, feature dispatch test, mocked optimization flow test, all using `@pytest.mark.asyncio`. diff --git a/tiles/codeflash-internal-skills/evals/scenario-4/capability.txt b/tiles/codeflash-internal-skills/evals/scenario-4/capability.txt new file mode 100644 index 000000000..6931dd247 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-4/capability.txt @@ -0,0 +1 @@ +Add a new API endpoint to both aiservice (Django-Ninja) and cf-api (Express) with correct schemas, auth, and middleware ordering. diff --git a/tiles/codeflash-internal-skills/evals/scenario-4/criteria.json b/tiles/codeflash-internal-skills/evals/scenario-4/criteria.json new file mode 100644 index 000000000..31ad9551a --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-4/criteria.json @@ -0,0 +1,31 @@ +{ + "context": "Tests ability to add API endpoints to both aiservice (Django-Ninja) and cf-api (Express) with correct schemas, auth, registration, and middleware ordering", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Correct Django-Ninja schemas", + "description": "Defines request and response schemas using ninja.Schema with correct field types and defaults (source_code: str, language: str = 'python', cyclomatic_complexity: int, etc.)", + "max_score": 20 + }, + { + "name": "Async aiservice endpoint with auth", + "description": "Creates an async def endpoint using AuthenticatedRequest, with response dict mapping status codes to schemas (200 for success, 400/500 for errors)", + "max_score": 20 + }, + { + "name": "Aiservice URL 
registration", + "description": "Registers the endpoint in aiservice/urls.py with path('ai/code-complexity', code_complexity_api.urls) following the kebab-case naming convention", + "max_score": 20 + }, + { + "name": "Express endpoint handler", + "description": "Creates an async handler function in js/cf-api/endpoints/code-complexity.ts that accepts Request/Response and returns JSON", + "max_score": 15 + }, + { + "name": "CF-API route registration with middleware", + "description": "Registers the route in the protected routes section of js/cf-api/routes/index.ts (after checkForValidAPIKey, not before body parser) with trackUsage middleware", + "max_score": 25 + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/scenario-4/task.md b/tiles/codeflash-internal-skills/evals/scenario-4/task.md new file mode 100644 index 000000000..a76e32794 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-4/task.md @@ -0,0 +1,38 @@ +# Scenario: Add a code-complexity analysis endpoint to both aiservice and cf-api + +## Context + +The team needs a new "code complexity" endpoint that accepts source code and returns complexity metrics (cyclomatic complexity, lines of code, etc.). This endpoint must exist in both: + +1. **aiservice** (Django-Ninja) at `/ai/code-complexity` -- performs the actual analysis +2. **cf-api** (Express) at `/code-complexity` -- proxies to aiservice, requires API key auth + +The aiservice endpoint should: +- Accept `source_code` (required string) and `language` (optional, default "python") +- Return `cyclomatic_complexity` (int), `lines_of_code` (int), and `maintainability_index` (float) +- Use `AuthenticatedRequest` for auth +- Be async + +The cf-api endpoint should: +- Be a protected route (behind `checkForValidAPIKey`) +- Forward the request to aiservice +- Track usage via `trackUsage` middleware + +## Task + +Write the implementation skeleton for both endpoints. Provide: + +1. The Django-Ninja schemas (request and response). +2. 
The aiservice router with the endpoint function signature. +3. The `aiservice/urls.py` registration line. +4. The Express endpoint handler in `js/cf-api/endpoints/`. +5. The route registration in `js/cf-api/routes/index.ts` with correct middleware ordering. + +## Expected Outputs + +- Request schema using `ninja.Schema` with `source_code: str` and `language: str = "python"`. +- Response schema with `cyclomatic_complexity: int`, `lines_of_code: int`, `maintainability_index: float`. +- Async endpoint function using `AuthenticatedRequest` and returning `tuple[int, ResponseSchema | ErrorSchema]`. +- URL registration: `path("ai/code-complexity", code_complexity_api.urls)` in `aiservice/urls.py`. +- Express handler as an async function in `js/cf-api/endpoints/code-complexity.ts`. +- Route registered in the **protected routes** section of `js/cf-api/routes/index.ts` (after `checkForValidAPIKey`), with `trackUsage` middleware applied. diff --git a/tiles/codeflash-internal-skills/evals/scenario-5/capability.txt b/tiles/codeflash-internal-skills/evals/scenario-5/capability.txt new file mode 100644 index 000000000..255bad972 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-5/capability.txt @@ -0,0 +1 @@ +Diagnose test generation instrumentation failure where GPU sync injection introduces syntax errors in PyTorch test code. 
diff --git a/tiles/codeflash-internal-skills/evals/scenario-5/criteria.json b/tiles/codeflash-internal-skills/evals/scenario-5/criteria.json new file mode 100644 index 000000000..9d11238b3 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-5/criteria.json @@ -0,0 +1,26 @@ +{ + "context": "Tests ability to diagnose test generation instrumentation failures where GPU sync injection introduces syntax errors", + "type": "weighted_checklist", + "checklist": [ + { + "name": "Identifies instrumentation stage", + "description": "Correctly identifies the instrumentation stage (Step 7) in core/languages/python/testgen/instrumentation/instrument_new_tests.py as the failure point, not postprocessing or LLM response", + "max_score": 25 + }, + { + "name": "Explains framework detection and sync injection", + "description": "Explains that detect_frameworks_from_code() found PyTorch, which triggered _create_device_sync_precompute_statements() to inject torch.cuda.synchronize() calls for GPU timing accuracy", + "max_score": 25 + }, + { + "name": "Diagnoses the syntax error cause", + "description": "Identifies that the sync statement was injected without proper newline/whitespace separation, causing it to concatenate with an existing line (torch.cuda.synchronize()import torch)", + "max_score": 30 + }, + { + "name": "Recommends fix approach", + "description": "Recommends fixing the injection logic to add proper newlines and suggests adding a post-instrumentation compilation check to catch such errors before returning", + "max_score": 20 + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/scenario-5/task.md b/tiles/codeflash-internal-skills/evals/scenario-5/task.md new file mode 100644 index 000000000..4c4fa6b4b --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/scenario-5/task.md @@ -0,0 +1,52 @@ +# Scenario: Generated tests fail to compile after instrumentation + +## Context + +Test generation for a Python function that uses PyTorch produces tests that 
fail to compile. The function under test is: + +```python +import torch + +def normalize_tensor(t: torch.Tensor) -> torch.Tensor: + return (t - t.mean()) / t.std() +``` + +The testgen request completes successfully -- the LLM returns valid test code. However, after the instrumentation stage, the tests fail with: + +``` +SyntaxError: invalid syntax + File "test_normalize_tensor_instrumented.py", line 15 + torch.cuda.synchronize()import torch + ^^^^^^ +``` + +The raw (pre-instrumentation) test code compiles fine. The issue only appears after instrumentation. + +Server logs: + +``` +INFO 2026-02-14 14:00:00 testgen: build_prompt completed, prompts non-empty +INFO 2026-02-14 14:00:02 llm: call_llm completed for model gpt-4o +INFO 2026-02-14 14:00:03 postprocessing: add_missing_imports completed +INFO 2026-02-14 14:00:03 instrumentation: detect_frameworks_from_code found ['torch'] +INFO 2026-02-14 14:00:03 instrumentation: _create_device_sync_precompute_statements injecting cuda.synchronize() +ERROR 2026-02-14 14:00:03 instrumentation: instrumented tests failed to compile - SyntaxError +``` + +## Task + +Diagnose why the instrumented tests fail to compile. Walk through the test generation pipeline, focusing on the instrumentation stage. Provide: + +1. The exact stage where the failure occurs. +2. The specific function responsible for the syntax error. +3. An explanation of what `_create_device_sync_precompute_statements()` does and how it can introduce syntax errors. +4. The file to investigate and what to look for. +5. A recommendation for fixing the instrumentation bug. + +## Expected Outputs + +- Identification that the failure is at the **instrumentation** stage (Step 7 of the debug-test-generation workflow). +- The file responsible is `core/languages/python/testgen/instrumentation/instrument_new_tests.py`. 
+- Explanation that `detect_frameworks_from_code()` detected PyTorch, which triggered `_create_device_sync_precompute_statements()` to inject `torch.cuda.synchronize()` calls for GPU timing accuracy. +- The syntax error (`torch.cuda.synchronize()import torch`) indicates the sync statement was injected without a newline separator, concatenating with an existing import line. +- Recommendation: check the injection logic in `_create_device_sync_precompute_statements()` to ensure it adds proper newlines/indentation when inserting sync calls, and add a compilation check after instrumentation to catch such issues before returning. diff --git a/tiles/codeflash-internal-skills/evals/summary.json b/tiles/codeflash-internal-skills/evals/summary.json new file mode 100644 index 000000000..76aed91e0 --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/summary.json @@ -0,0 +1,47 @@ +{ + "package_name": "codeflash-internal-skills", + "total_scenarios": 5, + "total_capabilities_tested": 12, + "scenarios": [ + { + "id": "scenario-1", + "title": "Optimization request for unsupported language (rust) returns error", + "skills_tested": ["debug-optimization-failure"], + "capabilities_tested": [0, 1], + "complexity": "basic", + "description": "A request with language='rust' fails at router dispatch because there is no handler for Rust. Tests ability to trace the failure to the dispatch stage and recommend either schema validation or new language support." + }, + { + "id": "scenario-2", + "title": "All optimization candidates removed by postprocessing", + "skills_tested": ["debug-optimization-failure"], + "capabilities_tested": [3, 4], + "complexity": "intermediate", + "description": "A valid Python optimization request produces 5 LLM candidates, but all are removed by deduplication and equality checks. Tests ability to diagnose postprocessing as the failure stage and use logging data to confirm." 
+ }, + { + "id": "scenario-3", + "title": "Add Go language support to the optimization system", + "skills_tested": ["add-language-support"], + "capabilities_tested": [5, 6, 7], + "complexity": "intermediate", + "description": "Implement Go as a new language with optimizer support only. Tests ability to follow the 7-step add-language-support workflow: directory structure, handler class with @register_handler, router dispatch with lazy imports, and test plan." + }, + { + "id": "scenario-4", + "title": "Add code-complexity endpoint to aiservice and cf-api", + "skills_tested": ["add-api-endpoint"], + "capabilities_tested": [8, 9, 10], + "complexity": "advanced", + "description": "Create a new endpoint in both Django-Ninja (aiservice) and Express (cf-api). Tests schemas, async handlers, AuthenticatedRequest, URL registration, and correct middleware ordering in Express route registration." + }, + { + "id": "scenario-5", + "title": "Instrumented PyTorch tests fail to compile", + "skills_tested": ["debug-test-generation"], + "capabilities_tested": [11, 12], + "complexity": "advanced", + "description": "Test generation succeeds but instrumentation injects torch.cuda.synchronize() without proper newlines, creating a syntax error. Tests ability to trace the failure to the instrumentation stage and identify the GPU sync injection bug." + } + ] +} diff --git a/tiles/codeflash-internal-skills/evals/summary_infeasible.json b/tiles/codeflash-internal-skills/evals/summary_infeasible.json new file mode 100644 index 000000000..f15d9fcbf --- /dev/null +++ b/tiles/codeflash-internal-skills/evals/summary_infeasible.json @@ -0,0 +1,21 @@ +{ + "package_name": "codeflash-internal-skills", + "total_infeasible": 3, + "infeasible_capabilities": [ + { + "capability": "End-to-end LLM call with real API keys", + "reason": "Testing actual LLM calls (Step 4 of debug-optimization-failure, Step 5 of debug-test-generation) requires live API keys for Azure OpenAI and Anthropic Foundry. 
Eval environments cannot safely provision these credentials, and mocking would defeat the purpose of testing the real integration.", + "skills_affected": ["debug-optimization-failure", "debug-test-generation"] + }, + { + "capability": "PostHog event tracking verification", + "reason": "Step 7 of debug-optimization-failure references PostHog events (aiservice-optimize-openai-usage). Verifying that events are sent correctly requires a PostHog instance and API access, which are not available in eval environments.", + "skills_affected": ["debug-optimization-failure"] + }, + { + "capability": "Sentry error capture verification", + "reason": "Step 6 of debug-optimization-failure references Sentry message capture when candidates fail to parse. Verifying Sentry integration requires a live Sentry DSN and cannot be tested in isolation without the full Django runtime.", + "skills_affected": ["debug-optimization-failure"] + } + ] +}